def dump_records(
    self,
    output_dir,
    records=None,
    file_format='phylip',
    use_hashname=True,
):
    """
    Dumps all sequence alignment records to an output directory.

    Files are dumped in sequential phylip format; by default the names
    are hashed.  A pickle named ``hash_translation.pkl`` is written into
    ``output_dir`` as well; it maps ``str(rec.name)`` to the value
    returned by that record's ``_write_temp_phylip`` call.

    Arguments:
    output_dir   -- directory to write into; handed to
                    ``directorycheck_and_make`` before anything is written
    records      -- records to dump; when None or empty,
                    ``self.get_records()`` is used instead
    file_format  -- accepted for interface compatibility, not read here
    use_hashname -- forwarded unchanged to ``rec._write_temp_phylip``

    Returns None.
    """
    directorycheck_and_make(output_dir)
    if not records:
        records = self.get_records()

    hash_translation = {}
    for rec in records:
        filename = rec._write_temp_phylip(output_dir,
                                          use_hashname=use_hashname)
        try:
            hash_translation[str(rec.name)] = filename
        except TypeError:
            # Best-effort diagnostic: report the offending record and carry
            # on with the remaining ones rather than aborting the dump.
            details = (type(rec.name), rec.name, type(filename), filename)
            print(' '.join(str(item) for item in details))

    # Use a context manager so the pickle is flushed and the handle is
    # closed deterministically instead of being left to the garbage
    # collector.
    translation_path = '{0}/hash_translation.pkl'.format(output_dir)
    with open(translation_path, 'w') as handle:
        cPickle.dump(hash_translation, handle)
def dump_records(self, output_dir, records=None, file_format="phylip", use_hashname=True):
    """
    Dumps all sequence alignment records to an output directory.

    Files are dumped in sequential phylip format; by default the names
    are hashed.  The mapping from ``str(rec.name)`` to the value returned
    by each record's ``_write_temp_phylip`` call is pickled to
    ``hash_translation.pkl`` inside ``output_dir``.

    Arguments:
    output_dir   -- directory to write into; handed to
                    ``directorycheck_and_make`` before anything is written
    records      -- records to dump; when None or empty,
                    ``self.get_records()`` is used instead
    file_format  -- accepted for interface compatibility, not read here
    use_hashname -- forwarded unchanged to ``rec._write_temp_phylip``

    Returns None.
    """
    directorycheck_and_make(output_dir)
    if not records:
        records = self.get_records()

    hash_translation = {}
    for rec in records:
        filename = rec._write_temp_phylip(output_dir, use_hashname=use_hashname)
        try:
            hash_translation[str(rec.name)] = filename
        except TypeError:
            # Best-effort diagnostic: print what could not be stored and
            # continue with the next record.
            details = (type(rec.name), rec.name, type(filename), filename)
            print(" ".join(str(item) for item in details))

    # The handle is closed (and the data flushed) as soon as the dump is
    # done, rather than whenever the file object happens to be collected.
    with open("{0}/hash_translation.pkl".format(output_dir), "w") as handle:
        cPickle.dump(hash_translation, handle)
parser.add_argument('-score', dest='score', action='store_true')

# Work with the parsed command line as a plain dict keyed by option name.
args = vars(parser.parse_args())

# Directory-like options have any trailing slashes removed.
input_dir = args['input'].rstrip('/')
output = args['output']
tmpdir = args['tmpdir'].rstrip('/')

# Remaining options are copied out unchanged.
min_clusters, max_clusters = args['min_clusters'], args['max_clusters']
distance, method = args['distance'], args['cluster_method']
delete, datatype, score = args['delete'], args['datatype'], args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

# Both environment variables must be set; a missing one raises KeyError.
gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

collection_options = dict(
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
)
sc = SequenceCollection(input_dir, **collection_options)
# Script body: read a gzipped SequenceCollection and dump its records
# (single records by default, cluster records with -c) into a directory.

# NOTE(review): inside the character class, `.-_` is a range running from
# '.' to '_' (so it also matches '/', ':' and similar), not three literal
# characters - confirm that this is what is wanted for the program name.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = '\n'.join(['Choose to dump post-clustering concatenated records',
                         'instead of pre-clustering single records'])

parser = argparse.ArgumentParser(prog=progname, description=desc)
parser.add_argument('-i', dest='input_file', help=input_help, type=str)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str)
# choice_help used to be built and then dropped; attach it to the option
# it describes so that it shows up in --help.
parser.add_argument('-c', dest='cluster_recs', action='store_true',
                    help=choice_help)
args = parser.parse_args()

input_file = args.input_file
output_dir = args.output_dir.rstrip('/')
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)

# Pick the record set once, then dump it with a single call.
if cluster_recs:
    records = sc.get_cluster_records()
else:
    # Should be the default anyway, but explicit is better than implicit.
    records = sc.get_records()
sc.dump_records(output_dir, records)
type=str, choices=valid_methods, default='spectral', ) args = vars(parser.parse_args()) input_dir = args['input'].rstrip('/') tmpdir = args['tmpdir'].rstrip('/') min_clusters = args['min_clusters'] max_clusters = args['max_clusters'] distance = args['distance'] method = args['cluster_method'] pickle = '{0}/scrand.pkl'.format(input_dir) directorycheck_and_raise(input_dir) directorycheck_and_make(tmpdir) filecheck_and_quit(pickle) sc = cPickle.load(open(pickle)) sc.tmpdir = tmpdir print 'Loading phyml results...' sc.load_phyml_results(input_dir, use_hashname=True) print 'Autotuning...' sc.autotune(distance, max_groups=max_clusters) print 'Clustering...' sc.put_partitions(distance, method, range(min_clusters, max_clusters)) sc.concatenate_records() sc.put_cluster_trees(program='bionj') scores = sorted(sc.get_scores(), key=lambda x: x[0]) print 'Scores:' for score in scores:
def simulate_from_record_WAG(
    cls,
    record,
    output_dir,
    name='tempsim',
    tmpdir='/tmp',
    allow_nonsense=False,
    split_lengths=None,
    gene_names=None,
):
    """
    Simulate an alignment for `record` with ALF using the 'WAG' model and
    write the result to `output_dir` in phylip format.

    The simulation is configured from ``record.seqlength`` and
    ``record.tree`` and runs in two scratch directories created under
    `tmpdir` (``alf_parameter_dir`` and ``alf_working_dir``); both are
    removed again before returning, whether or not the run succeeded.
    The simulated sequences are cut to ``record.seqlength`` and their
    headers are mapped back onto the labels of the input tree.

    Arguments:
    record          -- source record; ``seqlength`` and ``tree`` are read
    output_dir      -- receives the ``.phy`` file(s); one line is appended
                       to ``trees.txt`` in the same directory
    name            -- simulation name, also used for the output file name
                       when the record is not split
    tmpdir          -- existing directory for scratch data (checked with
                       ``directorycheck_and_quit``)
    allow_nonsense  -- accepted for interface compatibility, not read here
    split_lengths,
    gene_names      -- when both are truthy the simulated record is split
                       with ``split_by_lengths`` and every part is written
                       as ``<rec.name>.phy``

    Raises IndexError if ALF produced no ``*aa.fa`` alignment.
    Returns None.
    """
    length = record.seqlength
    tree = record.tree
    directorycheck_and_quit(tmpdir)
    gamma = tree.extract_gamma_parameter()

    param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
    working_dir = '{0}/alf_working_dir'.format(tmpdir)
    directorycheck_and_make(param_dir, verbose=False)
    directorycheck_and_make(working_dir, verbose=False)

    # Everything from here on runs under try/finally so the scratch
    # directories never outlive the call, even when ALF or parsing fails.
    try:
        treefile = '{0}/treefile.nwk'.format(tmpdir)
        tree.pam2sps('sps2pam').write_to_file(treefile)

        # NOTE(review): these two calls repeat the ones above with the
        # default verbosity; kept because the helper is not visible here.
        directorycheck_and_make(param_dir)
        directorycheck_and_make(working_dir)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)
        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # ALF's output for this run lives in <working_dir>/<name>.
        sim_dir = '{0}/{1}'.format(working_dir, name)

        tree_newick = tree.newick
        with open('{0}/RealTree.nwk'.format(sim_dir)) as realtree:
            alf_newick = realtree.read()

        # Pair the labels found in ALF's tree with the labels of the input
        # tree, position by position, to translate the headers back
        # (the original author marked this as a bug correction).
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(zip(re.findall(label_pattern, alf_newick),
                                    re.findall(label_pattern, tree_newick)))

        alignment_glob = '{0}/MSA/*aa.fa'.format(sim_dir)
        alignments = glob.glob(alignment_glob)
        if not alignments:
            # Same exception type as the former bare [0] lookup, but with
            # a message that says what was being looked for.
            raise IndexError(
                'no alignment matching {0}'.format(alignment_glob))
        alignment = alignments[0]

        new_record = TCSeqRec(alignment)
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()
        print(new_record.seqlength)

        # A header is looked up by the part before its last '/'.
        new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                              for x in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        if split_lengths and gene_names:
            with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                trf.write('{0}\t{1}\n'.format('-'.join(gene_names),
                                              tree.newick))
            for rec in new_record.split_by_lengths(split_lengths,
                                                   gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                trf.write('{0}\t{1}\n'.format(new_record.name, tree.newick))
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))
    finally:
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)
import argparse
import re

# Script body: load a pickled SequenceCollection, make a randomised copy,
# dump the copy's records and pickle the copy next to them.

# NOTE(review): inside the character class, `.-_` is a range running from
# '.' to '_' (so it also matches '/', ':' and similar), not three literal
# characters - confirm that this is what is wanted for the program name.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = '\n'.join(['Read a SequenceCollection from pickle,',
                  'make a randomised copy,',
                  'dump records'])
input_help = 'Path+Filename for the input pickle file'
output_help = 'Path to output directory. Will be created if doesn\'t exist'

parser = argparse.ArgumentParser(prog=progname, description=desc,
                                 formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-i', '--input', help=input_help, type=str)
parser.add_argument('-o', '--output', help=output_help, type=str)
args = vars(parser.parse_args())

pickle = args['input']
output_dir = args['output']

filecheck_and_quit(pickle)  # can't find file -> quit
directorycheck_and_make(output_dir)  # can't find directory -> create it

# NOTE: unpickling can execute arbitrary code - only feed this script
# pickle files from a trusted source.
# Context managers close both handles as soon as they are done with,
# instead of leaving them open until interpreter exit.
with open(pickle) as infile:
    sc = cPickle.load(infile)

scrand = sc.make_randomised_copy()
scrand.dump_records(output_dir)

with open('{0}/scrand.pkl'.format(output_dir), 'w') as outfile:
    cPickle.dump(scrand, outfile)
default='spectral', ) args = parser.parse_args() input_file = args.input_file output_dir = args.output_dir tmpdir = args.tmpdir min_clusters = args.min_clusters max_clusters = args.max_clusters distance = args.distance method = args.cluster_method ind = args.ind tree_method = args.tree_method filecheck_and_quit(input_file) directorycheck_and_make(output_dir) directorycheck_and_make(tmpdir) ################################################################################ # Main ################################################################################ from sequence_collection import SequenceCollection sc = SequenceCollection.gunzip(input_file) for c in range(min_clusters, max_clusters): try: assert (distance, method, c) in sc.clusters_to_partitions except AssertionError: print c sys.exit()
from errors import filecheck_and_quit, directorycheck_and_make
import cPickle
import sys
import argparse
import re

# Script body: load a pickled SequenceCollection, make a randomised copy,
# dump the copy's records and pickle the copy next to them.

# NOTE(review): inside the character class, `.-_` is a range running from
# '.' to '_' (so it also matches '/', ':' and similar), not three literal
# characters - confirm that this is what is wanted for the program name.
progname = re.compile("[A-Za-z0-9.-_]+").search(sys.argv[0]).group()
desc = "\n".join(["Read a SequenceCollection from pickle,",
                  "make a randomised copy,",
                  "dump records"])
input_help = "Path+Filename for the input pickle file"
output_help = "Path to output directory. Will be created if doesn't exist"

parser = argparse.ArgumentParser(prog=progname, description=desc,
                                 formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-i", "--input", help=input_help, type=str)
parser.add_argument("-o", "--output", help=output_help, type=str)
args = vars(parser.parse_args())

pickle = args["input"]
output_dir = args["output"]

filecheck_and_quit(pickle)  # can't find file -> quit
directorycheck_and_make(output_dir)  # can't find directory -> create it

# NOTE: unpickling can execute arbitrary code - only feed this script
# pickle files from a trusted source.
# Context managers close both handles as soon as they are done with,
# instead of leaving them open until interpreter exit.
with open(pickle) as infile:
    sc = cPickle.load(infile)

scrand = sc.make_randomised_copy()
scrand.dump_records(output_dir)

with open("{0}/scrand.pkl".format(output_dir), "w") as outfile:
    cPickle.dump(scrand, outfile)
def simulate_from_record_WAG(
    cls,
    record,
    output_dir,
    name='tempsim',
    tmpdir='/tmp',
    allow_nonsense=False,
    split_lengths=None,
    gene_names=None,
):
    """
    Simulate an alignment for `record` with ALF using the 'WAG' model and
    write the result to `output_dir` in phylip format.

    The simulation is configured from ``record.seqlength`` and
    ``record.tree`` and runs in two scratch directories created under
    `tmpdir` (``alf_parameter_dir`` and ``alf_working_dir``); both are
    removed again before returning, whether or not the run succeeded.
    The simulated sequences are cut to ``record.seqlength`` and their
    headers are mapped back onto the labels of the input tree.

    Arguments:
    record          -- source record; ``seqlength`` and ``tree`` are read
    output_dir      -- receives the ``.phy`` file(s); one line is appended
                       to ``trees.txt`` in the same directory
    name            -- simulation name, also used for the output file name
                       when the record is not split
    tmpdir          -- existing directory for scratch data (checked with
                       ``directorycheck_and_quit``)
    allow_nonsense  -- accepted for interface compatibility, not read here
    split_lengths,
    gene_names      -- when both are truthy the simulated record is split
                       with ``split_by_lengths`` and every part is written
                       as ``<rec.name>.phy``

    Raises IndexError if ALF produced no ``*aa.fa`` alignment.
    Returns None.
    """
    length = record.seqlength
    tree = record.tree
    directorycheck_and_quit(tmpdir)
    gamma = tree.extract_gamma_parameter()

    param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
    working_dir = '{0}/alf_working_dir'.format(tmpdir)
    directorycheck_and_make(param_dir, verbose=False)
    directorycheck_and_make(working_dir, verbose=False)

    # Everything from here on runs under try/finally so the scratch
    # directories never outlive the call, even when ALF or parsing fails.
    try:
        treefile = '{0}/treefile.nwk'.format(tmpdir)
        tree.pam2sps('sps2pam').write_to_file(treefile)

        # NOTE(review): these two calls repeat the ones above with the
        # default verbosity; kept because the helper is not visible here.
        directorycheck_and_make(param_dir)
        directorycheck_and_make(working_dir)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)
        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # ALF's output for this run lives in <working_dir>/<name>.
        sim_dir = '{0}/{1}'.format(working_dir, name)

        tree_newick = tree.newick
        with open('{0}/RealTree.nwk'.format(sim_dir)) as realtree:
            alf_newick = realtree.read()

        # Pair the labels found in ALF's tree with the labels of the input
        # tree, position by position, to translate the headers back
        # (the original author marked this as a bug correction).
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(zip(re.findall(label_pattern, alf_newick),
                                    re.findall(label_pattern, tree_newick)))

        alignment_glob = '{0}/MSA/*aa.fa'.format(sim_dir)
        alignments = glob.glob(alignment_glob)
        if not alignments:
            # Same exception type as the former bare [0] lookup, but with
            # a message that says what was being looked for.
            raise IndexError(
                'no alignment matching {0}'.format(alignment_glob))
        alignment = alignments[0]

        new_record = TCSeqRec(alignment)
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()
        print(new_record.seqlength)

        # A header is looked up by the part before its last '/'.
        new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                              for x in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        if split_lengths and gene_names:
            with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                trf.write('{0}\t{1}\n'.format('-'.join(gene_names),
                                              tree.newick))
            for rec in new_record.split_by_lengths(split_lengths,
                                                   gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                trf.write('{0}\t{1}\n'.format(new_record.name, tree.newick))
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))
    finally:
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)