# Command-line script: unpack a gzipped SequenceCollection, load PhyML results
# for its cluster records, refresh the scores and write the collection back to
# the same file it was read from.
import argparse
import os
import re
import sys

from errors import filecheck_and_quit, directorycheck_and_quit

# Derive the program name from the script's basename. The earlier pattern
# '[A-Za-z0-9.-_]+' contained the accidental range '.-_', which admits '/'
# and other punctuation but not a literal '-'; the hyphen is placed last
# here so that it is taken literally.
progname_match = re.search(r'[A-Za-z0-9._-]+', os.path.basename(sys.argv[0]))
# prog=None lets argparse fall back to its own default program name.
progname = progname_match.group() if progname_match else None

desc = ('Load PhyML results into a gzipped SequenceCollection, update its '
        'scores and save it back to the same file')
input_help = 'Filepath+name of gzipped SequenceCollection object'
phyml_help = 'Directory containing the PhyML results to load'
# Not referenced by this script; kept so the module-level names are unchanged.
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both options are needed by the code below, so let argparse report a missing
# one instead of failing later on a None value.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-t', dest='phyml_dir', help=phyml_help, type=str,
                    required=True)
args = parser.parse_args()

input_file = args.input_file
phyml_dir = args.phyml_dir.rstrip('/')

# Validate both paths before doing any work.
# NOTE(review): the helpers live in the project's `errors` module; presumably
# they exit when the path is missing -- confirm.
filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

# Imported only once the arguments have been checked.
from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records, use_hashname=True)
sc.update_scores()
# The updated collection overwrites the input file.
sc.gzip(input_file)
type=str, help=datatype_help) parser.add_argument('-score', dest='score', action='store_true') args = vars(parser.parse_args()) input_dir = args['input'].rstrip('/') output = args['output'] tmpdir = args['tmpdir'].rstrip('/') min_clusters = args['min_clusters'] max_clusters = args['max_clusters'] distance = args['distance'] method = args['cluster_method'] delete = args['delete'] datatype = args['datatype'] score = args['score'] directorycheck_and_quit(input_dir) directorycheck_and_make(tmpdir) gtp_path = os.environ['GTP_PATH'] helper = os.environ['DARWINHELPER'] from sequence_collection import SequenceCollection sc = SequenceCollection( input_dir, file_format='phylip', datatype=datatype, helper=helper, gtp_path=gtp_path, tmpdir=tmpdir, overwrite=True,
################################################################################
# Command-line script: unpack a gzipped SequenceCollection, load PhyML results
# for its cluster records, refresh the scores and write the collection back to
# the same file it was read from.
import argparse
import os
import re
import sys

from errors import filecheck_and_quit, directorycheck_and_quit

# Derive the program name from the script's basename. The earlier pattern
# '[A-Za-z0-9.-_]+' contained the accidental range '.-_', which admits '/'
# and other punctuation but not a literal '-'; the hyphen is placed last
# here so that it is taken literally.
progname_match = re.search(r'[A-Za-z0-9._-]+', os.path.basename(sys.argv[0]))
# prog=None lets argparse fall back to its own default program name.
progname = progname_match.group() if progname_match else None

desc = ('Load PhyML results into a gzipped SequenceCollection, update its '
        'scores and save it back to the same file')
input_help = 'Filepath+name of gzipped SequenceCollection object'
phyml_help = 'Directory containing the PhyML results to load'
# Not referenced by this script; kept so the module-level names are unchanged.
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both options are needed by the code below, so let argparse report a missing
# one instead of failing later on a None value.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-t', dest='phyml_dir', help=phyml_help, type=str,
                    required=True)
args = parser.parse_args()

input_file = args.input_file
phyml_dir = args.phyml_dir.rstrip('/')

# Validate both paths before doing any work.
# NOTE(review): the helpers live in the project's `errors` module; presumably
# they exit when the path is missing -- confirm.
filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

# Imported only once the arguments have been checked.
from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records, use_hashname=True)
sc.update_scores()
# The updated collection overwrites the input file.
sc.gzip(input_file)
def simulate_from_record_WAG(
    cls,
    record,
    output_dir,
    name='tempsim',
    tmpdir='/tmp',
    allow_nonsense=False,
    split_lengths=None,
    gene_names=None,
    ):
    """Simulate an alignment along `record`'s tree with ALF and the WAG model.

    The tree is written to `<tmpdir>/treefile.nwk`, an instance of `cls` is
    configured (indels, rate variation from the tree's gamma parameter, a
    single-gene root genome, the 'WAG' model, the custom tree) and run. The
    resulting `*aa.fa` alignment is loaded, truncated to the length of the
    input record, relabelled with the names of the input tree and written to
    `output_dir` in phylip format. A line describing the tree is appended
    to `<output_dir>/trees.txt`.

    Parameters:
        record: object providing `seqlength` and `tree`.
        output_dir: directory receiving `trees.txt` and the `.phy` file(s).
        name: simulation name; also the file stem of the unsplit output.
        tmpdir: existing directory in which `alf_parameter_dir`,
            `alf_working_dir` and `treefile.nwk` are created.
        allow_nonsense: not used by this method.
        split_lengths, gene_names: when both are truthy the simulated record
            is split with `split_by_lengths` and every part is written to
            `<output_dir>/<part name>.phy`.

    Returns:
        None.

    Raises:
        IndexError: if the ALF run left no `*aa.fa` alignment behind.
        KeyError / ValueError: if a simulated header cannot be mapped back to
            a name of the input tree.
    """
    length = record.seqlength
    tree = record.tree
    directorycheck_and_quit(tmpdir)
    gamma = tree.extract_gamma_parameter()

    param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
    working_dir = '{0}/alf_working_dir'.format(tmpdir)
    treefile = '{0}/treefile.nwk'.format(tmpdir)
    directorycheck_and_make(param_dir, verbose=False)
    directorycheck_and_make(working_dir, verbose=False)

    # Everything after the directories exist runs under try/finally so that a
    # failed simulation cannot leave stale ALF output behind for a later run
    # to pick up by accident.
    try:
        # NOTE(review): treefile.nwk itself is left in tmpdir, as before.
        tree.pam2sps('sps2pam').write_to_file(treefile)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)
        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        run_dir = '{0}/{1}'.format(working_dir, name)
        with open('{0}/RealTree.nwk'.format(run_dir)) as newick_file:
            alf_newick = newick_file.read()

        # Pair the labels of ALF's tree with those of the input tree by
        # position, so simulated sequences can get their original names back.
        label_pattern = re.compile(r'(\w+)(?=:)')
        replacement_dict = dict(zip(label_pattern.findall(alf_newick),
                                    label_pattern.findall(tree.newick)))

        alignment = glob.glob('{0}/MSA/*aa.fa'.format(run_dir))[0]
        new_record = TCSeqRec(alignment)
        # Trim every simulated sequence to the length of the input record.
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()
        # Retained so the console output is unchanged; the parenthesised form
        # is valid on both Python 2 and Python 3.
        print(new_record.seqlength)
        # Headers are looked up by the part before their last '/'.
        new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                              for x in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        split_output = bool(split_lengths and gene_names)
        if split_output:
            tree_label = '-'.join(gene_names)
        else:
            tree_label = new_record.name
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            trf.write('{0}\t{1}\n'.format(tree_label, tree.newick))

        if split_output:
            for rec in new_record.split_by_lengths(split_lengths, gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            # Unsplit case: the whole alignment is written under `name`.
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))
    finally:
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)
help=datatype_help) parser.add_argument('-score', dest='score', action='store_true') args = vars(parser.parse_args()) input_dir = args['input'].rstrip('/') output = args['output'] tmpdir = args['tmpdir'].rstrip('/') min_clusters = args['min_clusters'] max_clusters = args['max_clusters'] distance = args['distance'] method = args['cluster_method'] delete = args['delete'] datatype = args['datatype'] score = args['score'] directorycheck_and_quit(input_dir) directorycheck_and_make(tmpdir) gtp_path = os.environ['GTP_PATH'] helper = os.environ['DARWINHELPER'] from sequence_collection import SequenceCollection sc = SequenceCollection( input_dir, file_format='phylip', datatype=datatype, helper=helper, gtp_path=gtp_path, tmpdir=tmpdir, overwrite=True,
def simulate_from_record_WAG(
    cls,
    record,
    output_dir,
    name='tempsim',
    tmpdir='/tmp',
    allow_nonsense=False,
    split_lengths=None,
    gene_names=None,
    ):
    """Simulate an alignment along `record`'s tree with ALF and the WAG model.

    The tree is written to `<tmpdir>/treefile.nwk`, an instance of `cls` is
    configured (indels, rate variation from the tree's gamma parameter, a
    single-gene root genome, the 'WAG' model, the custom tree) and run. The
    resulting `*aa.fa` alignment is loaded, truncated to the length of the
    input record, relabelled with the names of the input tree and written to
    `output_dir` in phylip format. A line describing the tree is appended
    to `<output_dir>/trees.txt`.

    Parameters:
        record: object providing `seqlength` and `tree`.
        output_dir: directory receiving `trees.txt` and the `.phy` file(s).
        name: simulation name; also the file stem of the unsplit output.
        tmpdir: existing directory in which `alf_parameter_dir`,
            `alf_working_dir` and `treefile.nwk` are created.
        allow_nonsense: not used by this method.
        split_lengths, gene_names: when both are truthy the simulated record
            is split with `split_by_lengths` and every part is written to
            `<output_dir>/<part name>.phy`.

    Returns:
        None.

    Raises:
        IndexError: if the ALF run left no `*aa.fa` alignment behind.
        KeyError / ValueError: if a simulated header cannot be mapped back to
            a name of the input tree.
    """
    length = record.seqlength
    tree = record.tree
    directorycheck_and_quit(tmpdir)
    gamma = tree.extract_gamma_parameter()

    param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
    working_dir = '{0}/alf_working_dir'.format(tmpdir)
    treefile = '{0}/treefile.nwk'.format(tmpdir)
    directorycheck_and_make(param_dir, verbose=False)
    directorycheck_and_make(working_dir, verbose=False)

    # Everything after the directories exist runs under try/finally so that a
    # failed simulation cannot leave stale ALF output behind for a later run
    # to pick up by accident.
    try:
        # NOTE(review): treefile.nwk itself is left in tmpdir, as before.
        tree.pam2sps('sps2pam').write_to_file(treefile)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)
        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        run_dir = '{0}/{1}'.format(working_dir, name)
        with open('{0}/RealTree.nwk'.format(run_dir)) as newick_file:
            alf_newick = newick_file.read()

        # Pair the labels of ALF's tree with those of the input tree by
        # position, so simulated sequences can get their original names back.
        label_pattern = re.compile(r'(\w+)(?=:)')
        replacement_dict = dict(zip(label_pattern.findall(alf_newick),
                                    label_pattern.findall(tree.newick)))

        alignment = glob.glob('{0}/MSA/*aa.fa'.format(run_dir))[0]
        new_record = TCSeqRec(alignment)
        # Trim every simulated sequence to the length of the input record.
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()
        # Retained so the console output is unchanged; the parenthesised form
        # is valid on both Python 2 and Python 3.
        print(new_record.seqlength)
        # Headers are looked up by the part before their last '/'.
        new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                              for x in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        split_output = bool(split_lengths and gene_names)
        if split_output:
            tree_label = '-'.join(gene_names)
        else:
            tree_label = new_record.name
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            trf.write('{0}\t{1}\n'.format(tree_label, tree.newick))

        if split_output:
            for rec in new_record.split_by_lengths(split_lengths, gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            # Unsplit case: the whole alignment is written under `name`.
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))
    finally:
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)