def _write_shuffle_test_cmds(cmdf, sto_path):
    """Append the per-alignment command lines for <sto_path> to <cmdf>.

    Five lines are written, in order: run_pfold, run_gbpml, run_gbpml with
    the -n flag, run_dnaml-erate, and a find|xargs line that invokes
    run_pscore.sh on every file matching ``<sto_path>*.tree``.
    """
    cmdf.write("python $GBPML/run_pfold.py " + sto_path + '\n')
    cmdf.write("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(sto_path))
    cmdf.write("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(sto_path))
    cmdf.write("bash $GBPML/run_dnaml-erate.sh " + sto_path + '\n')
    # "{{}}" is an escaped "{}" so that xargs -i receives its placeholder.
    cmdf.write("find {0}*.tree|xargs -n1 -i bash $GBPML/scripts/run_pscore.sh {0} {{}}\n".format(sto_path))


# NOTE(review): this file, as seen here, defines a second `main` with a
# different signature further down, which rebinds the name -- confirm which
# definition callers are meant to reach.
def main(sto_filename, output_prefix, ntaxa, cmdf, shuffle_iter):
    """
    Takes <sto_filename>, randomly picks <ntaxa> species, renames them
    T0..T<ntaxa-1>, trims the alignment and writes it to
    <output_prefix>.original.sto. Then, <shuffle_iter> times, calls
    shuffle_cols() on the same alignment and writes each result to
    <output_prefix>.shuffle_iter<k>.sto.

    For every file written, the matching command lines are appended
    to <cmdf> (any object with a write() method).

    random.sample raises ValueError if <ntaxa> exceeds the number of
    sequences in the alignment.

    Always returns True.
    """
    msa = MSA(sto_filename)
    picked = random.sample(range(msa.nseq), ntaxa)
    msa.nseq = ntaxa
    msa.ids = ['T' + str(i) for i in range(ntaxa)]
    msa.aln = [msa.aln[i] for i in picked]
    msa.trim_gaps(removeAmbs=True, threshold=1.)  # remove just ambs and all-gap cols

    original_sto = output_prefix + '.original.sto'
    msa.write_stockholm(original_sto)
    _write_shuffle_test_cmds(cmdf, original_sto)

    if shuffle_iter > 0:
        for round_no in range(shuffle_iter):
            shuffled_sto = output_prefix + '.shuffle_iter' + str(round_no) + '.sto'
            # The same MSA object is reshuffled each round, so round k starts
            # from the output of round k-1, not from the original alignment.
            msa.shuffle_cols()
            msa.write_stockholm(shuffled_sto)
            _write_shuffle_test_cmds(cmdf, shuffled_sto)

    return True
def _stderr_line(*parts):
    """Write <parts> to stderr, space-separated, followed by a newline.

    Produces the same bytes as the Python 2 statement
    ``print >> sys.stderr, a, b`` but also runs on Python 3.
    """
    sys.stderr.write(" ".join(str(p) for p in parts) + "\n")


def _write_concordance_cmds(cmdf, sto_path):
    """Append the per-alignment command lines for <sto_path> to <cmdf>.

    Four lines are written, in order: run_pfold, run_gbpml, run_gbpml with
    the -n flag, and run_dnaml-erate.
    """
    cmdf.write("python $GBPML/run_pfold.py " + sto_path + '\n')
    cmdf.write("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(sto_path))
    cmdf.write("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(sto_path))
    cmdf.write("bash $GBPML/run_dnaml-erate.sh " + sto_path + '\n')


# NOTE(review): this file, as seen here, also defines an earlier `main` with
# a different signature; this later definition is the one left bound to the
# name -- confirm that is intended.
def main(sto_filename, output_prefix, trim_gap_threshold, singlify_threshold, ntaxa, d, cmdf):
    """
    Preparing for the rRNA concordance test.

    Takes <sto_filename>, randomly picks <ntaxa> species,
     a) removes all paired cols w/ too few canonical pairs
        (<singlify_threshold>) or ambiguous code
     b) removes all unpaired cols w/ too many gaps
        (<trim_gap_threshold>) or ambiguous code

    Writes <output_prefix>.raw.sto (before filtering) and
    <output_prefix>.original.sto (after filtering), then passes the
    filtered alignment to halve_msa() and appends the command lines
    for the two files it returns to <cmdf>.

    Fills up dict <d> with:
      <prefix base> --> {'ids': ..., 'pre': stats before filtering,
                         'post': stats after filtering}

    random.sample raises ValueError if <ntaxa> exceeds the number of
    sequences in the alignment.

    Always returns True.
    """
    msa = MSA(sto_filename)
    picked = random.sample(range(msa.nseq), ntaxa)
    msa.nseq = ntaxa
    msa.ids = [msa.ids[i] for i in picked]
    msa.aln = [msa.aln[i] for i in picked]

    d_key = os.path.basename(output_prefix)
    record = {}
    d[d_key] = record
    record['ids'] = msa.ids
    record['pre'] = msa.get_stats()
    msa.write_stockholm(output_prefix + '.raw.sto')

    _stderr_line("delete pair cols ambiguous or has canonical less than ", singlify_threshold)
    msa.singlify_pairs(singlify_threshold, delete_instead_of_singlify=True, removeAmb=True)
    _stderr_line("delete single cols ambiguous or more gaps % than", trim_gap_threshold)
    msa.trim_gaps(removeAmbs=True, threshold=trim_gap_threshold)
    msa.write_stockholm(output_prefix + '.original.sto')

    post_stats = msa.get_stats()
    record['post'] = post_stats
    single_before_recheck = post_stats['single']

    # Sanity check: trimming a second time with the same threshold must not
    # change the 'single' statistic.
    msa.trim_gaps(removeAmbs=True, threshold=trim_gap_threshold)
    assert msa.get_stats()['single'] == single_before_recheck

    # halve_msa is defined elsewhere; it returns two file paths.
    file1, file2 = halve_msa(msa, output_prefix)
    _write_concordance_cmds(cmdf, file1)
    _write_concordance_cmds(cmdf, file2)

    return True