import argparse
import re
import sys
from errors import filecheck_and_quit, directorycheck_and_quit

# Command-line script: load a gzipped SequenceCollection, attach the phyml
# results found in a directory to its cluster records, recompute the scores
# and write the collection back to the same gzip file.

# NOTE(review): inside the character class, `.-_` is a *range* (from '.' to
# '_'), not three literal characters, so it also admits '/', ':' and similar.
# A path-like sys.argv[0] is therefore kept together with its directory part.
# The pattern is left untouched so the program name shown in the usage
# message does not change.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = ('Load phyml results into a gzipped SequenceCollection and update '
        'its scores')
input_help = 'Filepath+name of gzipped SequenceCollection object'
phyml_help = 'Path of directory containing phyml results'
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both options are mandatory: without them the script previously failed with
# an AttributeError on `None.rstrip('/')` instead of a usage message.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-t', dest='phyml_dir', help=phyml_help, type=str,
                    required=True)

args = parser.parse_args()
input_file = args.input_file
# Drop trailing slashes from the directory path.
phyml_dir = args.phyml_dir.rstrip('/')

# Validate the paths before doing any real work.
filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

# Imported only after the arguments have been validated.
from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records,
                      use_hashname=True)
sc.update_scores()
# The updated collection overwrites the file it was read from.
sc.gzip(input_file)
                    type=str, help=datatype_help)
# Boolean flag: args['score'] is True only when -score is given.
parser.add_argument('-score', dest='score', action='store_true')

# Parse the command line and expose the result as a plain dict so the
# individual options can be pulled out by key below.
args = vars(parser.parse_args())
# Directory arguments have trailing slashes removed.
# NOTE(review): .rstrip() raises AttributeError if the option is None --
# whether these options are required/defaulted is defined above this view.
input_dir = args['input'].rstrip('/')
output = args['output']
tmpdir = args['tmpdir'].rstrip('/')
min_clusters = args['min_clusters']
max_clusters = args['max_clusters']
distance = args['distance']
method = args['cluster_method']
delete = args['delete']
datatype = args['datatype']
score = args['score']

# Presumably: abort if the input directory is missing, create the temporary
# directory if needed -- the helpers are defined elsewhere; confirm there.
directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

# Both environment variables must be set; a missing one raises KeyError.
gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

# Imported here, after argument handling, rather than at the top of the file.
from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
################################################################################

import argparse
import re
import sys
from errors import filecheck_and_quit, directorycheck_and_quit

# Command-line script: read a gzipped SequenceCollection, load the phyml
# results stored in a directory into its cluster records, update the scores
# and save the collection back over the input file.

# NOTE(review): `.-_` inside the brackets is a character *range* ('.' through
# '_'), so '/', ':' etc. are matched as well and a path-like sys.argv[0]
# keeps its directory part. The pattern is deliberately unchanged so the
# program name printed in the usage text stays the same.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = ('Load phyml results into a gzipped SequenceCollection and update '
        'its scores')
input_help = 'Filepath+name of gzipped SequenceCollection object'
phyml_help = 'Path of directory containing phyml results'
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Mark both options as required so that a missing one produces a usage
# error rather than an AttributeError on `None.rstrip('/')` further down.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-t', dest='phyml_dir', help=phyml_help, type=str,
                    required=True)

args = parser.parse_args()
input_file = args.input_file
# Trailing slashes are stripped from the directory path.
phyml_dir = args.phyml_dir.rstrip('/')

# Check both paths up front.
filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

# Imported only once the arguments have passed the checks above.
from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records, use_hashname=True)
sc.update_scores()
# Written back to the same path it was loaded from.
sc.gzip(input_file)
# Example no. 4
0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
        ):
        """Simulate an alignment from `record` with ALF under the WAG model.

        The tree and sequence length of `record` parameterise an ALF run
        carried out in scratch directories below `tmpdir`. The simulated
        alignment is truncated to the record's length, relabelled with the
        names from the record's tree and written in phylip format to
        `output_dir`; a `<label>\\t<newick>` line is appended to
        `<output_dir>/trees.txt`.

        NOTE(review): the first parameter is `cls` and is called as a
        constructor, so this is presumably a classmethod -- the decorator is
        outside the visible source.

        Args:
            record: object providing `seqlength` and `tree`.
            output_dir: directory receiving the .phy file(s) and trees.txt.
            name: simulation name; also the output file stem when the
                result is not split.
            tmpdir: existing directory in which scratch files are created.
            allow_nonsense: accepted for interface compatibility; not used
                by this method.
            split_lengths: passed to `split_by_lengths`; used only together
                with `gene_names`.
            gene_names: names for the split records; used only together
                with `split_lengths`.

        Returns:
            None.

        Raises:
            IndexError: if no `*aa.fa` alignment is found after the run.
            ValueError: if a simulated header contains no '/'.
            KeyError: if a simulated header has no counterpart in the tree.
        """

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)

        tree.pam2sps('sps2pam').write_to_file(treefile)

        # NOTE(review): these repeat the two calls above (without
        # verbose=False); kept because the helper's output/side effects are
        # not visible from here.
        directorycheck_and_make(param_dir)
        directorycheck_and_make(working_dir)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)

        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # Files produced by the run are read from <working_dir>/<name>.
        results_dir = '{0}/{1}'.format(working_dir, name)
        tree_newick = tree.newick
        # Context manager so the handle is closed deterministically.
        with open('{0}/RealTree.nwk'.format(results_dir)) as handle:
            alf_newick = handle.read()

        # Pair up the labels preceding ':' in ALF's tree with those in the
        # input tree, position by position, to map simulated names back to
        # the original ones (relies on both newicks listing them in the
        # same order).
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(zip(re.findall(label_pattern, alf_newick),
                                    re.findall(label_pattern, tree_newick)))

        alignment = glob.glob('{0}/MSA/*aa.fa'.format(results_dir))[0]

        new_record = TCSeqRec(alignment)
        # Trim every simulated sequence to the length of the source record.
        new_record.sequences = [seq[:length] for seq in
                                new_record.sequences]
        new_record._update()

        # Parenthesised form prints the same value under Python 2 and is
        # also valid Python 3.
        print(new_record.seqlength)
        # Headers are looked up by the part before their last '/'.
        new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                              for x in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        split = split_lengths and gene_names
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            label = ('-'.join(gene_names) if split else new_record.name)
            trf.write('{0}\t{1}\n'.format(label, tree.newick))
        if split:
            for rec in new_record.split_by_lengths(split_lengths,
                    gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir,
                                 rec.name))
        else:
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir,
                                    name))

        # Remove the scratch directories (the tree file in tmpdir is kept).
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)
                    help=datatype_help)
# Boolean switch: args['score'] becomes True only if -score is passed.
parser.add_argument('-score', dest='score', action='store_true')

# Convert the parsed Namespace into a dict and unpack the options by key.
args = vars(parser.parse_args())
# Trailing slashes are stripped from directory-valued options.
# NOTE(review): .rstrip() fails with AttributeError on None -- the option
# definitions (required/default) lie above this view; confirm there.
input_dir = args['input'].rstrip('/')
output = args['output']
tmpdir = args['tmpdir'].rstrip('/')
min_clusters = args['min_clusters']
max_clusters = args['max_clusters']
distance = args['distance']
method = args['cluster_method']
delete = args['delete']
datatype = args['datatype']
score = args['score']

# Presumably: quit when the input directory does not exist and create the
# temporary directory when missing -- helper semantics defined elsewhere.
directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

# Required environment variables; an unset one raises KeyError.
gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

# Imported at this point, after argument handling, not at the file top.
from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
# Example no. 6
0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
    ):
        """Run an ALF simulation under WAG based on `record` and save it.

        `record.tree` and `record.seqlength` drive an ALF run executed in
        scratch directories created under `tmpdir`. The resulting alignment
        is cut to the record's length, its headers are mapped back to the
        names used in the record's tree, and it is written as phylip to
        `output_dir`. One `<label>\\t<newick>` line is appended to
        `<output_dir>/trees.txt`.

        NOTE(review): `cls` is invoked as a constructor, so this is
        presumably a classmethod -- the decorator is not part of the
        visible source.

        Args:
            record: object exposing `seqlength` and `tree`.
            output_dir: destination for the .phy file(s) and trees.txt.
            name: simulation name, also used as the file stem of the
                unsplit output.
            tmpdir: existing directory for scratch files.
            allow_nonsense: kept for interface compatibility; unused here.
            split_lengths: forwarded to `split_by_lengths`; only honoured
                when `gene_names` is given too.
            gene_names: names of the split records; only honoured when
                `split_lengths` is given too.

        Returns:
            None.

        Raises:
            IndexError: if the run left no `*aa.fa` alignment behind.
            ValueError: if a simulated header contains no '/'.
            KeyError: if a simulated header cannot be mapped to the tree.
        """

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)

        tree.pam2sps('sps2pam').write_to_file(treefile)

        # NOTE(review): duplicates of the two calls above, minus
        # verbose=False; retained since the helper's behaviour cannot be
        # checked from this view.
        directorycheck_and_make(param_dir)
        directorycheck_and_make(working_dir)

        sim = cls(simulation_name=name,
                  working_directory=working_dir,
                  outfile_path=param_dir,
                  unit_is_pam=True)

        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # Output of the run is read from <working_dir>/<name>.
        results_dir = '{0}/{1}'.format(working_dir, name)
        tree_newick = tree.newick
        # `with` closes the file handle instead of leaking it.
        with open('{0}/RealTree.nwk'.format(results_dir)) as handle:
            alf_newick = handle.read()

        # Labels that precede ':' are extracted from both newick strings and
        # zipped positionally, giving a map from ALF's names to the names in
        # the input tree (assumes both strings list them in the same order).
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(
            zip(re.findall(label_pattern, alf_newick),
                re.findall(label_pattern, tree_newick)))

        alignment = glob.glob('{0}/MSA/*aa.fa'.format(results_dir))[0]

        new_record = TCSeqRec(alignment)
        # Truncate the simulated sequences to the source record's length.
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()

        # print(x) gives identical output on Python 2 and parses on Python 3.
        print(new_record.seqlength)
        # Each header is translated via the part before its last '/'.
        new_record.headers = [
            replacement_dict[x[:x.rindex('/')]] for x in new_record.headers
        ]
        new_record._update()
        new_record.sort_by_name()

        split = split_lengths and gene_names
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            label = '-'.join(gene_names) if split else new_record.name
            trf.write('{0}\t{1}\n'.format(label, tree.newick))
        if split:
            for rec in new_record.split_by_lengths(split_lengths, gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))

        # Clean up the scratch directories (the tree file in tmpdir stays).
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)