def main(dirname):
    """
    Gather column entropies from the '*.sto' files found in every '*iter*'
    subdirectory of <dirname>.

    Only files whose full path contains a 'c<digits>p<digits>' tag are read;
    the two numbers are used as the (# of single, # of paired) key.  Files
    without the tag are silently skipped.

    Returns a 3-tuple:
      result      -- defaultdict: (# of single, # of paired) --> list holding,
                     per file, the sum of all its column entropies
      just_single -- calc_entropy value of every column from msa.single_cols(),
                     pooled over all files
      just_paired -- calc_entropy value of every (i, j) pair in msa.BP,
                     pooled over all files
    """
    from MSA import MSA

    # Raw string: \S and \d are regex classes, not string escape sequences.
    rex = re.compile(r'\S+c(\d+)p(\d+)\S+')
    result = defaultdict(list)
    just_single = []
    just_paired = []

    for fulld in glob.iglob(os.path.join(dirname, '*iter*/')):
        if not os.path.isdir(fulld):
            continue
        for sto in glob.iglob(os.path.join(fulld, '*.sto')):
            m = rex.match(sto)
            if m is None:
                continue  # path carries no c<N>p<M> tag: ignore
            num_single = int(m.group(1))
            num_paired = int(m.group(2))
            msa = MSA(sto)

            # Accumulate in the same order as the columns are visited so the
            # floating-point total is reproducible.
            ent = 0
            for i in msa.single_cols():
                s = calc_entropy([x[i] for x in msa.aln])
                just_single.append(s)
                ent += s
            # items() (rather than iteritems()) works on Python 2 and 3 alike.
            for i, j in msa.BP.items():
                p = calc_entropy([x[i] + x[j] for x in msa.aln])
                just_paired.append(p)
                ent += p
            result[(num_single, num_paired)].append(ent)

    return result, just_single, just_paired
def main():
    """
    Command-line entry point: align the sequences of the input file named in
    sys.argv[1], print every alignment output line and hand the output to
    OutputData("./output.txt", ...).

    Prints a usage line and returns when the argument count is wrong.
    For the input, hypercube and alignment stages the wall-clock time and the
    value of memory_usage() are recorded and reported at the end.
    Always returns None.
    """
    if len(sys.argv) != 2:
        print ("Usage: python Main.py <input_file_location>")
        return

    print ("INFO: Starting Multiple Sequence Alignment.")
    performance = {}

    def record(stage, began):
        # Take the time first, then sample memory, as one (seconds, memory) pair.
        elapsed = time.time() - began
        performance[stage] = (elapsed, memory_usage())

    began = time.time()
    input_data = InputData(sys.argv[1])
    record("input", began)

    began = time.time()
    hypercube = Hypercube(input_data.sequences)
    record("hcube", began)

    # Construction of the aligner is deliberately left outside the timed span.
    msa = MSA(hypercube)
    began = time.time()
    msa.align()
    record("MSA", began)

    for line in msa.output:
        print (line)
    writer = OutputData("./output.txt", msa.output)

    print ("INFO: Performance: (Execution time [s], Memory usage [MB])")
    for stage, (seconds, memory) in performance.items():
        print ("\t{0}: \t{1} s, \t{2} MB".format(stage, seconds, memory))
    print ("INFO: Done.")
    return
def main(msa_filename, tree_filename, single_model_filename=os.path.join(os.environ['LCODE'],'data/single_model'), \
        paired_model_filename=os.path.join(os.environ['LCODE'],'data/pair_model')):
    """
    Load an alignment, the single/paired models and a newick tree, then
    return the `like_s_n_p` value produced by calc_likelihood().

    msa_filename          -- passed to MSA()
    tree_filename         -- newick tree, read with dendropy
    single_model_filename -- passed to SingleModel()
    paired_model_filename -- passed to PairedModel() together with the single model

    NOTE: the two default paths are built when this `def` is executed, so
    the LCODE environment variable must be set at import time or the module
    raises KeyError.
    NOTE: xrange and the in-place sort of BP.items() make this Python 2 only.
    """
    from MSA import MSA
    from EvoModel import SingleModel, PairedModel
    # NOTE(review): star-import inside a function is rejected by Python 3;
    # which of the names used below come from Tree cannot be told from here.
    from Tree import *
    msa = MSA(msa_filename)
    single_model = SingleModel(single_model_filename)
    paired_model = PairedModel(paired_model_filename, single_model)
    # --------------- using newick ---------------------
    # acc = list(msa.ids)
    # post_order_traversal(t, acc)
    # order = acc[msa.nseq:]
    # -------------- using dendropy -------------------
    t2 = dendropy.Tree.get_from_path(tree_filename, 'newick')
    # keep only the sequences whose id is a leaf label of the tree
    msa.remove_seqs_not_in_tree([x.taxon.label for x in t2.leaf_nodes()])
    t = t2
    order = postorder_assign_then_traverse(t, list(msa.ids))
    single_cols = xrange(msa.aln_len)
    # sorted list of the (i, j) items of msa.BP
    paired_cols = msa.BP.items()
    paired_cols.sort()
    n = msa.nseq  # not used below
    S = init_likelihood(msa, single_cols, single_model)
    g = MyMat.calc_likelihood  # not used below
    # NOTE: NO LONGER logs the single model Frequency!
    # first calculate the null model (joint indep prob at each position)
    # TODO: this is not the fastest code ever....but will do for now
    # (L_null is computed here but not read again in this function)
    L_null = [sum(sum(exp(S[:msa.nseq, col, :4]) * log(single_model.Frequency))) for col in single_cols]
    # convert S into 1d
    nnode, ncol, nbase = S.shape
    S = scipy.ascontiguousarray(S.reshape(S.size))
    P = init_likelihood_paired(msa, paired_cols, paired_model, nnode)
    nnode_p, ncol_p, nbase_p = P.shape
    # flatten P to 1d the same way as S
    P = scipy.ascontiguousarray(P.reshape(P.size))
    # The S and P built above are replaced by the ones calc_likelihood returns.
    # NOTE(review): whether the init_likelihood* calls above are still needed
    # (e.g. for side effects) is not visible from here -- confirm before pruning.
    like_s, like_s_n_p, S, P = calc_likelihood(msa, order, single_model, paired_model) # need to use this to set up S, P for rearr
    return like_s_n_p
# Tail of a function whose header lies above this chunk (presumably usage(),
# which the script below calls -- confirm).
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    # Parse the command line and pull out the options used by the script.
    options = usage()
    msa_filename = options.msa_filename
    tree_filename = options.tree_filename
    single_model_filename = options.single_model
    paired_model_filename = options.paired_model
    treat_gap_as_missing = options.treat_gap_as_missing
    # Option sanity checks (note: asserts are skipped when run with -O).
    assert 0. < options.trim_gap_threshold <= 1.
    assert 1 <= options.cpu
    msa = MSA(msa_filename, options.ignore_bp)
    msa.trim_gaps(removeAmbs=True, threshold=options.trim_gap_threshold)
    single_model = SingleModel(single_model_filename)
    paired_model = PairedModel(paired_model_filename, single_model)
    # -------------- using dendropy -------------------
    t = dendropy.Tree.get_from_path(tree_filename, 'newick')
    # have to call remove_seqs_not_in_tree because sometimes I
    # will manually trim leaves from the tree
    msa.remove_seqs_not_in_tree([x.taxon.label for x in t.leaf_nodes()])
    # edge lengths of 0 will cause calculation problems...
    # TODO: better way to handle this?
    # Every non-positive edge length is replaced by 1e-3.
    # NOTE(review): if dendropy leaves edge_length as None on some node (e.g.
    # the root), Python 2 treats None <= 0 as true and the node is patched
    # too, whereas Python 3 would raise TypeError here -- confirm.
    for n in t.nodes():
        if n.edge_length <= 0:
            n.edge_length = 1e-3
def main(output_prefix, trim_gap_threshold, singlify_threshold, d, cmdf):
    """
    Preparing for the rRNA concordance test.

    Reads <output_prefix>.raw.sto, then
     a) deletes paired cols that are ambiguous or fall below <singlify_threshold>
        (via msa.singlify_pairs with delete_instead_of_singlify=True)
     b) trims single cols that are ambiguous or exceed <trim_gap_threshold>
        (via msa.trim_gaps)
    writes the result to <output_prefix>.original.sto, splits it with
    halve_msa() and appends to <cmdf> the commands to run on each half.

    Fills dict <d> with: <basename of output_prefix> --> {'ids', 'pre', 'post'}
    where 'pre'/'post' are msa.get_stats() before/after the filtering.

    Always returns True.
    """
    msa = MSA(output_prefix + '.raw.sto')

    entry_name = os.path.basename(output_prefix)
    d[entry_name] = {}
    d[entry_name]['ids'] = msa.ids
    d[entry_name]['pre'] = msa.get_stats()

    print >> sys.stderr, "delete pair cols ambiguous or has canonical less than ", singlify_threshold
    msa.singlify_pairs(singlify_threshold, delete_instead_of_singlify=True, removeAmb=True)
    print >> sys.stderr, "delete single cols ambiguous or more gaps % than", trim_gap_threshold
    msa.trim_gaps(removeAmbs=True, threshold=trim_gap_threshold)

    filtered_sto = output_prefix+'.original.sto'
    msa.write_stockholm(filtered_sto)

    single_before = msa.get_stats()['single']
    d[entry_name]['post'] = msa.get_stats()
    # sanity check: trimming a second time must not change the single count
    msa.trim_gaps(removeAmbs=True, threshold=trim_gap_threshold)
    assert msa.get_stats()['single'] == single_before

    # The same four commands are queued for each half, first half first.
    # (No commands are queued for the unsplit .original.sto file.)
    first_half, second_half = halve_msa(msa, output_prefix)
    for half in (first_half, second_half):
        command_lines = [
            "python $GBPML/run_pfold.py " + half + '\n',
            "python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(half),
            "python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(half),
            "bash $GBPML/run_dnaml-erate.sh " + half + '\n',
        ]
        for line in command_lines:
            cmdf.write(line)

    return True
def _write_analysis_commands(cmdf, sto_path):
    """Append to <cmdf> the five shell commands that process <sto_path>."""
    cmdf.write("python $GBPML/run_pfold.py " + sto_path + '\n')
    cmdf.write("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(sto_path))
    cmdf.write("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(sto_path))
    cmdf.write("bash $GBPML/run_dnaml-erate.sh " + sto_path + '\n')
    cmdf.write("find {0}*.tree|xargs -n1 -i bash $GBPML/scripts/run_pscore.sh {0} {{}}\n".format(sto_path))


def main(sto_filename, output_prefix, ntaxa, cmdf, shuffle_iter):
    """
    Takes <sto_filename>, randomly picks <ntaxa> sequences (renamed T0, T1, ...),
    trims the alignment and writes it to <output_prefix>.original.sto.

    If <shuffle_iter> > 0, additionally writes <shuffle_iter> files
    <output_prefix>.shuffle_iter<k>.sto, calling msa.shuffle_cols() before
    each one (so every shuffle is applied on top of the previous one).

    For every file written, the commands to process it are appended to <cmdf>
    (any object with a write() method).

    Raises ValueError (from random.sample) if <ntaxa> exceeds the number of
    sequences in the alignment.  Always returns True.
    """
    msa = MSA(sto_filename)

    # Subsample the rows and relabel them; nseq/ids/aln are overwritten in place.
    picked = random.sample(range(msa.nseq), ntaxa)
    msa.nseq = ntaxa
    msa.ids = ['T' + str(i) for i in range(ntaxa)]
    msa.aln = [msa.aln[i] for i in picked]
    msa.trim_gaps(removeAmbs=True, threshold=1.)  # remove just ambs and all-gap cols

    original_path = output_prefix + '.original.sto'
    msa.write_stockholm(original_path)
    _write_analysis_commands(cmdf, original_path)

    # Guard kept so that a non-positive (or, on Python 2, None) count is skipped.
    if shuffle_iter > 0:
        for shuffle_idx in range(shuffle_iter):
            shuffled_path = output_prefix + '.shuffle_iter' + str(shuffle_idx) + '.sto'
            msa.shuffle_cols()
            msa.write_stockholm(shuffled_path)
            _write_analysis_commands(cmdf, shuffled_path)

    return True
# Tail of a function whose header lies above this chunk.
    P = init_likelihood_paired(msa, paired_cols, paired_model, nnode)
    nnode_p, ncol_p, nbase_p = P.shape
    # flatten P into a contiguous 1d array
    P = scipy.ascontiguousarray(P.reshape(P.size))
    # the P built above is replaced by the one calc_likelihood returns
    like_s, like_s_n_p, S, P = calc_likelihood(msa, order, single_model, paired_model) # need to use this to set up S, P for rearr
    return like_s_n_p


if __name__ == "__main__":
    from MSA import *
    from EvoModel import *
    from Tree import *
    # NOTE(review): the return value of usage() is discarded, yet msa_filename,
    # tree_filename and the model filenames are read below -- presumably
    # usage() sets them as module globals; confirm.
    usage()
    msa = MSA(msa_filename)
    # NOTE(review): 0.7 is passed positionally; which trim_gaps parameter it
    # binds to is not visible from here -- confirm it is the gap threshold.
    msa.trim_gaps(0.7)
    single_model = SingleModel(single_model_filename)
    paired_model = PairedModel(paired_model_filename, single_model)
    # --------------- using newick ---------------------
    # acc = list(msa.ids)
    # post_order_traversal(t, acc)
    # order = acc[msa.nseq:]
    # -------------- using dendropy -------------------
    t2 = dendropy.Tree.get_from_path(tree_filename, 'newick')
    # keep only the sequences whose id is a leaf label of the tree
    msa.remove_seqs_not_in_tree([x.taxon.label for x in t2.leaf_nodes()])
    t = t2
    order = postorder_assign_then_traverse(t, list(msa.ids))
    # Blocks until Enter is pressed (Python 2 raw_input).
    raw_input("break")