def put_records(
        self,
        files=None,
        record_list=None,
        file_format='fasta',
        datatype='protein',
    ):
        """
        Store sequence records on this object and index them by name.

        If `record_list` is given (and non-empty) it is used as-is and
        `files` is ignored.  Otherwise one TCSeqRec is built for each path
        in `files`, named after the file's basename without its final
        extension.  If neither is supplied a message is printed and nothing
        is stored.

        Sets:
            self.records          -- the list of records
            self.length           -- number of records
            self.records_to_keys  -- record name -> position in the list
            self.keys_to_records  -- position in the list -> record
        """

        def get_name(path):
            # Basename without the final extension.  Splitting (rather than
            # slicing with rindex) means a path that has no '/' or no '.'
            # still yields a sensible name instead of raising ValueError.
            return path.rsplit('/', 1)[-1].rsplit('.', 1)[0]

        if not record_list:
            if not files:
                print('Can\'t load records - no records or alignment files given')
                return
            record_list = [
                TCSeqRec(f,
                         file_format=file_format,
                         name=get_name(f),
                         datatype=datatype) for f in files
            ]

        self.records = record_list
        self.length = len(record_list)
        # If two records share a name, the later position wins.
        self.records_to_keys = dict((record.name, number)
                                    for (number, record)
                                    in enumerate(record_list))
        self.keys_to_records = dict(enumerate(record_list))
Example #2
0
    def simulate_set(
        self,
        K,
        M,
        n,
        tune,
        regime,
        branch_length_func,
        inner_edge_params,
        leaf_params,
        scale_func,
        mk=None,
        master_tree_generator_method='random_topology',
        class_tree_permuter='nni',
        guarantee_unique=False,
        num_permutations=0,
        scale_params=(1, 1),
        gene_length_kappa=1,
        gene_length_theta=1,
        gene_length_min=10,
        filepath='./',
        tmpdir='/tmp',
        gtp_path='./class_files',
        unit_is_pam=True,
        quiet=True,
        ):
        """
        Simulate M gene alignments on n species, grouped into K classes.

        Regime 1:
            1 topology (n species)
            M alignments
            (2n - 3) branch lengths in total

        Regime 2:
            K topologies (n species)
            M alignments, distributed among K classes
            K * (2n - 3) branch lengths in total

        Regime 3:
            K topologies (n species)
            M alignments, distributed among K classes
            Each of Mk alignments in class k has scaled branch lengths
            (Mk - 1) * (2n - 3) branch lengths in total

        Regime 4:
            K topologies (n species)
            M alignments, distributed among K classes
            Each of Mk alignments in class k has independent branch lengths
            M * K * (2n - 3) branch lengths in total

        Tuning:
        The tuning parameter gives coarse control over the difference in sizes
        of groups - for example a very large value ( > 1000 ) tends to
        assign M - K + 1 genes to a single group, and 1 gene to each of the
        others, and a very small value ( < 1/1000 ) tends to assign M/K genes
        to each class. A zero value makes all groups the same size.

        Parameters:
        K, M, n             -- number of classes, genes and species
                               (K is forced to 1 under regime 1)
        tune                -- class-size tuning value (see above); only used
                               when `mk` is None
        regime              -- 1, 2, 3 or 4 (see above)
        branch_length_func  -- passed as `distribution_func` when branch
                               lengths are randomised
        inner_edge_params,
        leaf_params         -- passed as `inner_edges` / `leaves` when branch
                               lengths are randomised
        scale_func,
        scale_params        -- regime 3 scales each gene tree by
                               scale_func(*scale_params)
        mk                  -- explicit list of class sizes; must sum to M
        master_tree_generator_method
                            -- 'random_topology', 'random_yule' or
                               'random_coal'
        class_tree_permuter -- 'nni', 'spr' or 'coal'
        guarantee_unique    -- if True, nni/spr class trees are re-drawn until
                               self.check_diff_top accepts them
        num_permutations    -- permutations applied to the master tree per
                               class; if 0 every class gets an independently
                               generated tree instead
        gene_length_kappa,
        gene_length_theta   -- passed to SeqSim.root_genome
        gene_length_min     -- passed to SeqSim.root_genome as `min_length`
                               (regimes 3 and 4 only)
        filepath            -- output directory (created if missing)
        tmpdir              -- scratch directory for ALF and gtp files
        gtp_path            -- directory containing gtp.jar
        unit_is_pam         -- passed to SeqSim; also controls conversion of
                               tree branch lengths with pam2sps('sps2pam')
        quiet               -- passed to self.runALF

        Writes true trees, DNA and amino-acid alignments and
        `treedistances.txt` under `filepath`, and removes its scratch files
        from `tmpdir` when finished.

        Raises ValueError if neither `tune` nor `mk` is given, or if the
        master tree method is not recognised.
        """

        print('{0} = {1}'.format(class_tree_permuter, num_permutations))

        # Fail fast, before anything is created on disk: class sizes need
        # either an explicit `mk` or a `tune` value to draw them from.
        if mk is None and tune is None:
            raise ValueError('Cannot assign genes to classes: '
                             'give either `tune` or `mk`')

        def sort_key(item):
            # Natural sort: digit runs compare as integers, the rest as text
            return tuple((int(num) if num else alpha) for (num, alpha) in
                         re.findall(r'(\d+)|(\D+)', item))

        def read_text(path):
            # Read a whole file and close the handle straight away
            with open(path) as handle:
                return handle.read()

        def class_stats(matrix, mk_list):
            """
            Mean and variance of the pairwise distances in `matrix`:
            overall (upper triangle), within classes and between classes.
            Rows/columns are assumed to be ordered by class, with class
            sizes given by `mk_list`.
            """

            upper = np.triu_indices(len(matrix), 1)
            intra_class = []
            inter_class = []
            # cs[i]:cs[i + 1] is the index range occupied by class i
            cs = np.concatenate((np.array([0]), np.cumsum(mk_list)))
            for i in range(len(mk_list)):
                (start, stop) = (cs[i], cs[i + 1])
                within = matrix[start:stop, start:stop]
                intra_class.extend(within[np.triu_indices(mk_list[i],
                                   1)].flatten())
                # Only classes to the right, so each pair is counted once
                inter_class.extend(matrix[start:stop, stop:].flatten())

            return {
                'overall_mean': np.mean(matrix[upper]),
                'intra_mean': np.mean(intra_class),
                'inter_mean': np.mean(inter_class),
                'overall_var': np.var(matrix[upper]),
                'intra_var': np.var(intra_class),
                'inter_var': np.var(inter_class),
                }

        def make_master_tree(
            n,
            method,
            names=None,
            inner_edge_params=(1, 1),
            leaf_params=(1, 1),
            distribution_func=np.random.gamma,
            ):
            """
            Function returns a tree object with n tips,
            named according to `names`, and constructed
            according to `method`, which is one of 'random_topology',
            'random_yule' and 'random_coal'
            """

            if method == 'random_topology':
                master_topology = Tree.new_random_topology(n,
                        names=names, rooted=True)
                # Honour the function's own argument rather than reaching
                # for the enclosing `branch_length_func`
                master_tree = master_topology.randomise_branch_lengths(
                    inner_edges=inner_edge_params,
                    leaves=leaf_params,
                    distribution_func=distribution_func)
                master_tree.newick = '[&R] ' + master_tree.newick
            elif method == 'random_yule':
                master_tree = Tree.new_random_yule(n, names=names)
            elif method == 'random_coal':
                master_tree = Tree.new_random_coal(n, names=names)
            else:
                raise ValueError('Unknown master tree method: {0!r}'.format(method))
            return master_tree

        def make_class_tree(
            master_tree,
            permutation_extent,
            method,
            with_check=True,
            checklist=None,
            ):
            """
            Function returns a tree object derived from a master tree,
            but with some permutation applied. The type of permutation
            is defined by `method`, one of 'nni', 'spr' and 'coal'
            If with_check is True, the generated tree is checked against
            a checklist of other trees on the same species, and permutations
            are applied until the new tree has a unique topology. This is
            only implemented for nni and spr.
            Any other `method` returns an unpermuted copy of the master tree.
            """

            if checklist is None:
                checklist = []

            if permutation_extent == 0:
                return master_tree

            new_tree = Tree(master_tree.newick)

            if method == 'nni':
                if with_check:
                    while not self.check_diff_top(new_tree, checklist):
                        new_tree = Tree(master_tree.newick)
                        for _ in range(permutation_extent):
                            new_tree = new_tree.nni()
                else:
                    for _ in range(permutation_extent):
                        new_tree = new_tree.nni()
            elif method == 'spr':
                if with_check:
                    while not self.check_diff_top(new_tree, checklist):
                        new_tree = Tree(master_tree.newick)
                        for _ in range(permutation_extent):
                            new_tree = \
                                new_tree.spr(disallow_sibling_SPRs=True)
                else:
                    for _ in range(permutation_extent):
                        new_tree = new_tree.spr()
            elif method == 'coal':
                new_tree = \
                    master_tree.get_constrained_gene_tree(scale_to=permutation_extent)
            return new_tree

        # Create directories for simulation trees and parameter files

        for directory in (
            '{0}/alf_parameter_dir'.format(tmpdir),
            '{0}/alf_trees_dir'.format(tmpdir),
            filepath,
            '{0}/true_trees'.format(filepath),
            '{0}/true_trees/individual'.format(filepath),
            '{0}/dna_alignments'.format(filepath),
            '{0}/aa_alignments'.format(filepath),
            ):
            if not os.path.isdir(directory):
                os.mkdir(directory)

        # Assign numbers of genes to classes
        # list `mk` gives number of genes in each class

        if regime == 1:
            K = 1

        if tune is not None and mk is None:
            if tune == 0:
                proportions = [float(K) / M for _ in range(K)]
            else:
                proportions = np.random.gamma(shape=float(M) / (tune * K),
                        scale=tune * float(K) / M, size=K)

            s = sum(proportions)
            # Round each share to a whole number of genes, but never to zero
            mk = [int((np.round(x * M / s) if x * M / s > 0.5 else 1.0))
                  for x in proportions]
            # Rounding may leave the total off by a few genes: top up the
            # smallest class, or trim the largest one
            diff = M - sum(mk)
            if diff > 0:
                mk[mk.index(min(mk))] += diff
            else:
                mk[mk.index(max(mk))] += diff
            assert min(mk) > 0.0, 'every class needs at least one gene'
        else:
            assert sum(mk) == M, '`mk` must sum to M'

        # Class label (1-based) of every gene, in class order
        true_clustering = [i + 1 for i in range(K) for _ in range(mk[i])]

        print('Simulating {0} genes in {1} classes, distributed as {2}'.format(M,
              K, mk))

        print('N classes = {0}'.format(K))
        print('N genes =  {0}'.format(M))
        print('N species = {0}'.format(n))
        print('Regime =  {0}'.format(regime))
        print('N permutations = {0}'.format(num_permutations))
        print('Tuning = {0}'.format(tune))
        print('mk = {0}'.format(mk))
        print('true clustering =  {0}'.format(true_clustering))

        # Create simulation trees

        # Make a master tree

        master_tree = make_master_tree(
            n,
            method=master_tree_generator_method,
            inner_edge_params=inner_edge_params,
            leaf_params=leaf_params,
            distribution_func=branch_length_func,
            )
        class_trees = []

        print('Master tree =  {0}'.format(master_tree))
        master_tree.write_to_file('{0}/true_trees/master.tree'.format(filepath),
                                  suppress_NHX=True)

        # make K class trees

        for k in range(K):
            print('Making class {0}/{1}'.format(k + 1, K))

            if num_permutations > 0:
                class_tree = make_class_tree(master_tree,
                        num_permutations, class_tree_permuter,
                        with_check=guarantee_unique,
                        checklist=class_trees)
            else:
                # No permutations requested: every class gets its own,
                # independently generated tree
                class_tree = make_master_tree(
                    n,
                    method=master_tree_generator_method,
                    inner_edge_params=inner_edge_params,
                    leaf_params=leaf_params,
                    distribution_func=branch_length_func,
                    )
            class_trees.append(class_tree)

            print('class tree =  {0}'.format(class_tree))
            class_tree.write_to_file('{0}/true_trees/class{1}.tree'.format(filepath,
                    k + 1), suppress_NHX=True)

            # ALF only behaves itself if trees are in PAM units,
            # so we scale our newly-generated class trees to have branch lengths
            # in PAM units.
            # Our class_trees list contains unconverted trees
            # NOTE(review): when unit_is_pam is False no tree file is written
            # here, yet regimes 1 and 2 point ALF at it below - confirm.

            if unit_is_pam:  # Default = True
                class_tree_PAM = class_tree.pam2sps('sps2pam')  # conversion from SPS to PAM
                class_tree_PAM.write_to_file('{0}/alf_trees_dir/class{1}_1.nwk'.format(tmpdir,
                        k + 1), suppress_NHX=True)

            # Write parameter files

            ngenes = mk[k]

            sim = SeqSim(simulation_name='class{0}_1'.format(k + 1),
                         working_directory='{0}/alf_working_dir'.format(tmpdir),
                         outfile_path='{0}/alf_parameter_dir'.format(tmpdir),
                         unit_is_pam=unit_is_pam)  # make new simulation object
            sim.parameters['subst'] = self.parameters['subst']  # copy over global parameters
            sim.parameters['indels'] = self.parameters['indels']
            sim.parameters['ratevar'] = self.parameters['ratevar']

            if regime in [1, 2]:
                # One ALF run simulates every gene of the class on one tree
                sim.root_genome(number_of_genes=ngenes,
                                kappa=gene_length_kappa,
                                theta=gene_length_theta)
                sim.custom_tree('{0}/alf_trees_dir/class{1}_1.nwk'.format(tmpdir,
                                k + 1))
                sim.write_parameters()

                continue

            # For regimes 3 & 4 each gene within a class is simulated along its own tree:
            #   Under regime 3 each gene within a class has its branch lengths scaled
            #   by a scaling parameter, and under regime 4 each gene has random branch lengths
            #   drawn from some distribution (although individual branch rescaling might be better)

            for gene_index in range(ngenes):
                if regime == 3:
                    # NOTE(review): unlike regime 4, the scaled tree is not
                    # converted to PAM units before being handed to ALF -
                    # confirm whether that is intended.
                    scale_factor = scale_func(*scale_params)
                    class_tree = class_trees[k].scale(scale_factor)
                elif regime == 4:
                    class_tree = \
                        class_trees[k].randomise_branch_lengths(inner_edges=inner_edge_params,
                            leaves=leaf_params,
                            distribution_func=branch_length_func)
                    if unit_is_pam:  # same conversion as before
                        class_tree = class_tree.pam2sps('sps2pam')

                gene_tree_file = \
                    '{0}/alf_trees_dir/class{1}_{2}.nwk'.format(tmpdir,
                        k + 1, gene_index + 1)
                class_tree.write_to_file(gene_tree_file, suppress_NHX=True)

                sim.root_genome(number_of_genes=1,
                                min_length=gene_length_min,
                                kappa=gene_length_kappa,
                                theta=gene_length_theta)
                sim.custom_tree(gene_tree_file)
                sim.rename('class{0}_{1}'.format(k + 1, gene_index + 1))
                sim.write_parameters()

        # Estimate distances between base class trees

        base_geodists = []
        base_eucdists = []
        base_symdists = []
        base_wrfdists = []
        with open('{0}/basetrees.nwk'.format(tmpdir), 'w') as handle:
            handle.write('\n'.join([x.newick.rstrip() for x in
                         class_trees]))
        os.system('java -jar {0}/gtp.jar -o {1}/baseout.txt {1}/basetrees.nwk'.format(gtp_path,
                  tmpdir))
        with open('{0}/baseout.txt'.format(tmpdir)) as handle:
            for line in handle:
                line = line.rstrip()
                if line:
                    # Each non-empty line is "<i> <j> <distance>"
                    (_, _, value) = line.split()
                    base_geodists.append(float(value))

        base_dpytrees = [dpy.Tree.get_from_string(x.newick, 'newick')
                         for x in class_trees]
        for a in range(K):
            for b in range(a + 1, K):
                (tree_a, tree_b) = (base_dpytrees[a], base_dpytrees[b])
                base_eucdists.append(tree_a.euclidean_distance(tree_b))
                base_symdists.append(tree_a.symmetric_difference(tree_b))
                base_wrfdists.append(tree_a.robinson_foulds_distance(tree_b))

        # The summary file is written now and appended to at the end, so no
        # handle stays open (or leaks on error) while the simulations run.
        distances_path = '{0}/treedistances.txt'.format(filepath)
        summary_template = ('True clustering:\t{0}\n'
                            'Class base tree distances:\n'
                            'geodesic\t{1}\n'
                            'euclidean\t{2}\n'
                            'RF\t{3}\n'
                            'wRF\t{4}\n'
                            '\n')
        with open(distances_path, 'w') as writer:
            writer.write(summary_template.format(true_clustering,
                         np.mean(base_geodists), np.mean(base_eucdists),
                         np.mean(base_symdists), np.mean(base_wrfdists)))

        # Run simulations, and correct ALF renaming bug

        parameter_files = \
            glob.glob('{0}/alf_parameter_dir/*.drw'.format(tmpdir))
        tree_files = glob.glob('{0}/alf_trees_dir/*.nwk'.format(tmpdir))
        parameter_files.sort(key=sort_key)
        tree_files.sort(key=sort_key)

        for (params, tree) in zip(parameter_files, tree_files):
            self.runALF(params, quiet=quiet)
            # Simulation name = parameter file basename without extension
            # (the '+ 1' keeps the leading '/' out of the name)
            name = params[params.rindex('/') + 1:params.rindex('.')]
            (class_number, base_gene_number) = re.findall(r'(\d+)',
                    name)
            tree_newick = read_text(tree)
            alf_newick = \
                read_text('{0}/alf_working_dir/{1}/RealTree.nwk'.format(tmpdir,
                          name))
            # Map the taxon labels ALF wrote back to the labels of our tree,
            # pairing them in order of appearance
            replacement_dict = dict(zip(re.findall(r'(\w+)(?=:)',
                                    alf_newick),
                                    re.findall(r'(\w+)(?=:)',
                                    tree_newick)))  # bug correction

            # DNA alignments first, then amino-acid alignments
            for kind in ('dna', 'aa'):
                pattern = \
                    '{0}/alf_working_dir/{1}/MSA/*{2}.fa'.format(tmpdir,
                        name, kind)
                for alignment in sorted(glob.glob(pattern), key=sort_key):
                    gene_number = alignment[alignment.rindex('/')
                        + 1:].split('_')[1]
                    record = TCSeqRec(alignment)
                    record.sort_by_name()
                    record.headers = [replacement_dict[x[:x.rindex('/')]]
                                      for x in record.headers]
                    out_stem = \
                        '{0}/{1}_alignments/class{2}_{3}'.format(filepath,
                            kind, class_number, int(base_gene_number)
                            + int(gene_number) - 1)
                    record.write_fasta(out_stem + '.fas')
                    record.write_phylip(out_stem + '.phy')

            # Write true trees
            # NOTE(review): pam2sps() is applied whatever unit_is_pam says.

            true_tree = Tree(tree_newick).pam2sps()
            if regime in [1, 2]:
                # Every gene of the class shares the one class tree
                for g in range(mk[int(class_number) - 1]):
                    true_tree.write_to_file('{0}/true_trees/individual/class{1}_{2}.nwk'.format(filepath,
                            class_number, g + 1), suppress_NHX=True)
            else:
                true_tree.write_to_file('{0}/true_trees/individual/{1}.nwk'.format(filepath,
                        name), suppress_NHX=True)

        # Intra- and inter-class stats

        tree_paths = \
            glob.glob('{0}/true_trees/individual/*.nwk'.format(filepath))
        tree_paths.sort(key=sort_key)

        alltrees = [read_text(x).rstrip() for x in tree_paths]

        dpytrees = [dpy.Tree.get_from_string(x, 'newick') for x in
                    alltrees]

        geodists = np.zeros([M, M])
        eucdists = np.zeros([M, M])
        symdists = np.zeros([M, M])
        wrfdists = np.zeros([M, M])

        # using gtp.jar for geodesic distances

        with open('{0}/geotrees.nwk'.format(tmpdir), 'w') as handle:
            handle.write('\n'.join(alltrees))
        os.system('java -jar {0}/gtp.jar -o {1}/output.txt {1}/geotrees.nwk'.format(gtp_path,
                  tmpdir))
        with open('{0}/output.txt'.format(tmpdir)) as handle:
            for line in handle:
                line = line.rstrip()
                if line:
                    (i, j, value) = line.split()
                    i = int(i)
                    j = int(j)
                    geodists[i, j] = geodists[j, i] = float(value)

        for a in range(M):
            for b in range(a + 1, M):
                eucdists[a, b] = eucdists[b, a] = \
                    dpytrees[a].euclidean_distance(dpytrees[b])
                symdists[a, b] = symdists[b, a] = \
                    dpytrees[a].symmetric_difference(dpytrees[b])
                wrfdists[a, b] = wrfdists[b, a] = \
                    dpytrees[a].robinson_foulds_distance(dpytrees[b])

        class_reports = (
            ('Geodesic class stats', class_stats(geodists, mk)),
            ('Euc class stats', class_stats(eucdists, mk)),
            ('RF class stats', class_stats(symdists, mk)),
            ('wRF class stats', class_stats(wrfdists, mk)),
            )

        with open(distances_path, 'a') as writer:
            for (title, stats) in class_reports:
                writer.write(title + '\n')
                for key in sorted(stats):
                    writer.write('{0}\t{1}\n'.format(key, stats[key]))
                writer.write('\n')

        # Remove scratch files

        shutil.rmtree('{0}/alf_parameter_dir'.format(tmpdir))
        shutil.rmtree('{0}/alf_trees_dir'.format(tmpdir))
        shutil.rmtree('{0}/alf_working_dir'.format(tmpdir))
        os.remove('{0}/output.txt'.format(tmpdir))
        os.remove('{0}/geotrees.nwk'.format(tmpdir))
        os.remove('{0}/basetrees.nwk'.format(tmpdir))
        os.remove('{0}/baseout.txt'.format(tmpdir))
Example #3
0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
        ):
        """
        Simulate an amino-acid alignment along `record.tree` with ALF using
        the 'WAG' model, and write it to `output_dir` in PHYLIP format.

        record         -- source record; its `tree` and `seqlength` are used
        output_dir     -- where the .phy file(s) are written; the tree is
                          also appended to `trees.txt` in this directory
        name           -- simulation name, also the output file stem when
                          the alignment is not split
        tmpdir         -- scratch directory for ALF parameter/working files
        allow_nonsense -- accepted but not used by this method
        split_lengths,
        gene_names     -- if both are given, the simulated alignment is split
                          with `split_by_lengths` and one file is written per
                          piece, named after each piece

        The ALF parameter and working directories under `tmpdir` are removed
        once the output has been written.
        """

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)

        # The simulation object is created with unit_is_pam=True, so the
        # tree is written out after an 'sps2pam' conversion
        tree.pam2sps('sps2pam').write_to_file(treefile)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)

        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        tree_newick = tree.newick
        with open('{0}/{1}/RealTree.nwk'.format(working_dir, name)) as handle:
            alf_newick = handle.read()
        # Map the taxon labels ALF wrote back to the labels of the input
        # tree, pairing them in order of appearance
        replacement_dict = dict(zip(re.findall(r'(\w+)(?=:)',
                                alf_newick), re.findall(r'(\w+)(?=:)',
                                tree_newick)))  # bug correction

        alignment = \
            glob.glob('{0}/{1}/MSA/*aa.fa'.format(working_dir, name))[0]

        new_record = TCSeqRec(alignment)
        # min_length is only a lower bound, so trim to the source length
        new_record.sequences = [seq[:length] for seq in
                                new_record.sequences]
        new_record._update()

        print(new_record.seqlength)
        new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                              for x in new_record.headers]  # bug should be fixed
        new_record._update()
        new_record.sort_by_name()

        split_output = bool(split_lengths and gene_names)
        tree_label = ('-'.join(gene_names) if split_output
                      else new_record.name)
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            trf.write('{0}\t{1}\n'.format(tree_label, tree.newick))

        if split_output:
            for rec in new_record.split_by_lengths(split_lengths,
                    gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir,
                                 rec.name))
        else:
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir,
                                    name))

        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)
Example #4
0
parser.add_argument('-f', '--infile', help='input file', type=fpath, default='.')
parser.add_argument('-m', '--model', help='model', type=str, default='GTR')
parser.add_argument('-n', '--ncat', help='number of categories of gamma-distributed rate variation', type=int, default=4)
parser.add_argument('-d', '--datatype', help='datatype = nt (nucleotide), or aa (amino acid)', type=str, default='nt')
args = vars(parser.parse_args())

# The job index from the environment is appended to the input file path,
# so each job picks up its own index file
infile = args['infile'] + os.environ['LSB_JOBINDEX']
model = args['model']
datatype = args['datatype']
ncat = args['ncat']

# NOTE(review): both "not found" exits use status 0 - confirm that the
# scheduler is meant to see these as successful.
if not os.path.isfile(infile):
    print('Input file not found:\n{0}'.format(os.path.abspath(infile)))
    sys.exit(0)

# The index file holds the path of the alignment to process; strip it so
# a trailing newline does not become part of the path
with open(infile) as index_file:
    target = os.path.abspath(index_file.read().strip())

if not os.path.isfile(target):
    print('Target file not found:\n{0}'.format(os.path.abspath(target)))
    sys.exit(0)

print(target)
parent_dir = os.path.dirname(target)
name = getname(target)

record = TCSeqRec(target, file_format='phylip', name=name)
record.get_phyml_tree(model=model, ncat=ncat, datatype=datatype)

# Binary mode plus a context manager: the pickle is flushed and closed
# before the script exits
with open('{0}/{1}.pickle'.format(parent_dir, name), 'wb') as pickle_file:
    cPickle.dump(record, pickle_file)
except:
    tmpdir = '/tmp'

infile = args['infile']
if index:
    infile += index

# The path is expected to end in a digit; slicing with [-1:] keeps an empty
# path from raising IndexError
if not infile[-1:].isdigit():
    print('{0} is not correct'.format(infile))
    sys.exit(1)

if not os.path.isfile(infile):
    print('Input file not found:\n{0}'.format(os.path.abspath(infile)))
    sys.exit(0)

# The index file holds the path of the alignment to process; strip it so
# a trailing newline does not become part of the path
with open(infile) as index_file:
    target = os.path.abspath(index_file.read().strip())

if not os.path.isfile(target):
    print('Target file not found:\n{0}'.format(os.path.abspath(target)))
    sys.exit(2)

print(target)
parent_dir = os.path.dirname(target)
name = getname(target)

record = TCSeqRec(target, file_format='phylip', name=name)
pickle_path = '{0}/{1}.ml.pickle'.format(parent_dir, name)
# Skip the tree search when a pickle from an earlier run already exists
if not os.path.isfile(pickle_path):
    record.get_phyml_tree(model='GTR', ncat=4, datatype='nt', tmpdir=tmpdir)
    # In future let's just pickle trees; sequences already stored on disk
    with open(pickle_path, 'wb') as pickle_file:
        cPickle.dump(record, pickle_file)
                    default='.')
parser.add_argument('-p',
                    '--phylip_dir',
                    help='Subpath of simdir in which to find the phylip files',
                    type=fpath)
args = vars(parser.parse_args())
index = os.environ['LSB_JOBINDEX']
indir = os.path.abspath(args['directory'])
# A non-zero job index is appended to the input directory path
if index and index != '0':
    indir += index
print('Working on {0}'.format(indir))
phylip_dir = args['phylip_dir']
# Fall back to /tmp when no scratch directory is configured
tmpdir = os.environ.get('TEMPORARY_DIRECTORY', '/tmp')

working_dir = '/'.join([indir, phylip_dir])
phylip_files = sorted(glob.glob('{0}/*.phy'.format(working_dir)), key=sort_key)
print('Working on {0}'.format(working_dir))

for f in phylip_files:
    name = getname(f)
    print('Getting BIONJ tree for {0}'.format(name))
    seqrec = TCSeqRec(f, file_format='phylip', name=name)
    seqrec.datatype = 'dna'
    seqrec.get_bionj_tree(model='GTR', ncat=4, datatype='nt', tmpdir=tmpdir)
    # In future let's just pickle trees; sequences already stored on disk
    pickle_path = '{0}/{1}.nj.pickle'.format(working_dir, name)
    with open(pickle_path, 'wb') as pickle_file:
        cPickle.dump(seqrec, pickle_file)
def sort_key(item):
    """Natural sort key: digit runs compare as integers, the rest as text."""
    return tuple((int(num) if num else alpha)
                 for (num, alpha) in re.findall(r'(\d+)|(\D+)', item))


def getname(x):
    """Return the basename of path `x` without its final extension."""
    # Splitting (rather than slicing with rindex) also copes with paths
    # that have no '/' or no '.'
    return x.rsplit('/', 1)[-1].rsplit('.', 1)[0]


parser = argparse.ArgumentParser(prog='doclustering.py')
parser.add_argument('-d', '--directory', help='input directory', type=fpath, default='.')
parser.add_argument('-p', '--phylip_dir', help='Subpath of simdir in which to find the phylip files', type=fpath)
args = vars(parser.parse_args())
index = os.environ['LSB_JOBINDEX']
indir = os.path.abspath(args['directory'])
# A non-zero job index is appended to the input directory path
if index and index != '0':
    indir += index
print('Working on {0}'.format(indir))
phylip_dir = args['phylip_dir']
# Fall back to /tmp when no scratch directory is configured
tmpdir = os.environ.get('TEMPORARY_DIRECTORY', '/tmp')

working_dir = '/'.join([indir, phylip_dir])
phylip_files = sorted(glob.glob('{0}/*.phy'.format(working_dir)), key=sort_key)
print('Working on {0}'.format(working_dir))

for f in phylip_files:
    name = getname(f)
    print('Getting BIONJ tree for {0}'.format(name))
    seqrec = TCSeqRec(f, file_format='phylip', name=name)
    seqrec.datatype = 'dna'
    seqrec.get_bionj_tree(model='GTR', ncat=4, datatype='nt', tmpdir=tmpdir)
    # In future let's just pickle trees; sequences already stored on disk
    pickle_path = '{0}/{1}.nj.pickle'.format(working_dir, name)
    with open(pickle_path, 'wb') as pickle_file:
        cPickle.dump(seqrec, pickle_file)
infile = args['infile']
if index:
    infile += index

# The path is expected to end in a digit; slicing with [-1:] keeps an empty
# path from raising IndexError
if not infile[-1:].isdigit():
    print('{0} is not correct'.format(infile))
    sys.exit(1)

if not os.path.isfile(infile):
    print('Input file not found:\n{0}'.format(os.path.abspath(infile)))
    sys.exit(0)

# The index file holds the path of the alignment to process; strip it so
# a trailing newline does not become part of the path
with open(infile) as index_file:
    target = os.path.abspath(index_file.read().strip())

if not os.path.isfile(target):
    print('Target file not found:\n{0}'.format(os.path.abspath(target)))
    sys.exit(2)

print(target)
parent_dir = os.path.dirname(target)
name = getname(target)

record = TCSeqRec(target, file_format='phylip', name=name)
pickle_path = '{0}/{1}.ml.pickle'.format(parent_dir, name)
# Skip the tree search when a pickle from an earlier run already exists
if not os.path.isfile(pickle_path):
    record.get_phyml_tree(model='GTR', ncat=4, datatype='nt', tmpdir=tmpdir)
    # In future let's just pickle trees; sequences already stored on disk
    with open(pickle_path, 'wb') as pickle_file:
        cPickle.dump(record, pickle_file)
    name = get_name(dv)
    if not os.path.isfile('{0}/trees/{1}.nwk'.format(working_dir,
                          name)):
        tree = get_best_TC_tree(dv, gm, labels_file, tree_files, name)
        print tree
        tree.write_to_file('{0}/trees/{1}.nwk'.format(working_dir,
                           name), metadata=True)
    else:
        tree = Tree()
        tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir,
                            name))

    dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n'
            )[2:]).rstrip()
    labels_strip_header = labels.split('\n')[1].rstrip()
    record = TCSeqRec()
    record.dv = [(dv_matrix_strip_header, labels_strip_header)]
    record.tree = tree
    record.name = name
    record.headers = labels_strip_header.split()
    record.sequences = ['' for _ in record.headers]
    record._update()
    records.append(record)

# Wrap the records in a collection without computing distances up front;
# the gtp location is read from the GTP_PATH environment variable.
collection = SequenceCollection(
    records=records,
    get_distances=False,
    gtp_path=os.environ['GTP_PATH'],
)

# Build the 'rf' distance matrices, cluster on them with spectral rotation,
# and store the result as a Partition keyed by the clustering itself.
collection.put_distance_matrices('rf')
rf_matrix = collection.distance_matrices['rf']
T = collection.Clustering.run_spectral_rotate(rf_matrix)
collection.partitions[T] = Partition(T)
Example #10
0
                    '--datatype',
                    help='datatype = nt (nucleotide), or aa (amino acid)',
                    type=str,
                    default='nt')
args = vars(parser.parse_args())

# The job index from the environment is appended to the input file path,
# so each job picks up its own index file
infile = args['infile'] + os.environ['LSB_JOBINDEX']
model = args['model']
datatype = args['datatype']
ncat = args['ncat']

# NOTE(review): both "not found" exits use status 0 - confirm that the
# scheduler is meant to see these as successful.
if not os.path.isfile(infile):
    print('Input file not found:\n{0}'.format(os.path.abspath(infile)))
    sys.exit(0)

# The index file holds the path of the alignment to process; strip it so
# a trailing newline does not become part of the path
with open(infile) as index_file:
    target = os.path.abspath(index_file.read().strip())

if not os.path.isfile(target):
    print('Target file not found:\n{0}'.format(os.path.abspath(target)))
    sys.exit(0)

print(target)
parent_dir = os.path.dirname(target)
name = getname(target)

record = TCSeqRec(target, file_format='phylip', name=name)
record.get_phyml_tree(model=model, ncat=ncat, datatype=datatype)

# Binary mode plus a context manager: the pickle is flushed and closed
# before the script exits
with open('{0}/{1}.pickle'.format(parent_dir, name), 'wb') as pickle_file:
    cPickle.dump(record, pickle_file)
Example #11
0
    def simulate_set(
        self,
        K,
        M,
        n,
        tune,
        regime,
        branch_length_func,
        inner_edge_params,
        leaf_params,
        scale_func,
        mk=None,
        master_tree_generator_method='random_topology',
        class_tree_permuter='nni',
        guarantee_unique=False,
        num_permutations=0,
        scale_params=(1, 1),
        gene_length_kappa=1,
        gene_length_theta=1,
        gene_length_min=10,
        filepath='./',
        tmpdir='/tmp',
        gtp_path='./class_files',
        unit_is_pam=True,
        quiet=True,
    ):
        """
        Regime 1:
            1 topology (n species)
            M alignments
            (2n - 3) branch lengths in total

        Regime 2:
            K topologies (n species)
            M alignments, distributed among K classes
            K * (2n - 3) branch lengths in total

        Regime 3:
            K topologies (n species)
            M alignments, distributed among K classes
            Each of Mk alignments in class k has scaled branch lengths 
            (Mk - 1) * (2n - 3) branch lengths in total

        Regime 4:
            K topologies (n species)
            M alignments, distributed among K classes
            Each of Mk alignments in class k has independent branch lengths
            M * K * (2n - 3) branch lengths in total

        Tuning:
        The tuning parameter gives coarse control over the difference in sizes
        of groups - for example a very large value ( > 1000 ) tends to
        assign M - K + 1 genes to a single group, and 1 gene to each of the
        others, and a very small value ( < 1/1000 ) tends to assign M/K genes
        to each class. A zero value makes all groups the same size.

        """

        print '{0} = {1}'.format(class_tree_permuter, num_permutations)

        def class_stats(M, mk_list):
            d = {}
            nclasses = len(mk_list)
            Msize = len(M)
            ind = np.triu_indices(Msize, 1)
            intra_class = []
            inter_class = []
            cs = np.concatenate((np.array([0]), np.cumsum(mk_list)))
            for i in range(nclasses):
                intra_class += list(M[cs[i]:cs[i + 1],
                                      cs[i]:cs[i + 1]][np.triu_indices(
                                          mk_list[i], 1)].flatten())
                inter_class += list(M[cs[i]:cs[i + 1], cs[i + 1]:].flatten())
            d['overall_mean'] = np.mean(M[ind])
            d['intra_mean'] = np.mean(intra_class)
            d['inter_mean'] = np.mean(inter_class)
            d['overall_var'] = np.var(M[ind])
            d['intra_var'] = np.var(intra_class)
            d['inter_var'] = np.var(inter_class)

            return d

        def make_master_tree(
                n,
                method,
                names=None,
                inner_edge_params=(1, 1),
                leaf_params=(1, 1),
                distribution_func=np.random.gamma,
        ):
            """
            Function returns a tree object with n tips,
            named according to `names`, and constructed
            according to `method`, which is one of 'random_topology',
            'random_yule' and 'random_coal'
            """

            if method == 'random_topology':
                master_topology = Tree.new_random_topology(n,
                                                           names=names,
                                                           rooted=True)
                master_tree = \
                    master_topology.randomise_branch_lengths(inner_edges=inner_edge_params,
                        leaves=leaf_params,
                        distribution_func=branch_length_func)
                master_tree.newick = '[&R] ' + master_tree.newick
            elif method == 'random_yule':
                master_tree = Tree.new_random_yule(n, names=names)
            elif method == 'random_coal':
                master_tree = Tree.new_random_coal(n, names=names)
            return master_tree

        def make_class_tree(
            master_tree,
            permutation_extent,
            method,
            with_check=True,
            checklist=[],
        ):
            """
            Function returns a tree object derived from a master tree,
            but with some permutation applied. The type of permutation
            is defined by `method`, one of 'nni', 'spr' and 'coal'
            If with_check is True, the generated tree is checked against
            a checklist of other trees on the same species, and permutations
            are applied until the new tree has a unique topology. This is 
            only implemented for nni and spr.
            """

            if num_permutations == 0:
                return master_tree

            new_tree = Tree(master_tree.newick)

            if method == 'nni':
                if with_check:
                    while not self.check_diff_top(new_tree, checklist):
                        new_tree = Tree(master_tree.newick)
                        for i in range(permutation_extent):
                            new_tree = new_tree.nni()
                else:
                    for i in range(num_permutations):
                        new_tree = new_tree.nni()
            elif method == 'spr':
                if with_check:
                    while not self.check_diff_top(new_tree, checklist):
                        new_tree = Tree(master_tree.newick)
                        for i in range(permutation_extent):
                            new_tree = \
                                new_tree.spr(disallow_sibling_SPRs=True)
                else:
                    for i in range(num_permutations):
                        new_tree = new_tree.spr()
            elif method == 'coal':
                new_tree = \
                    master_tree.get_constrained_gene_tree(scale_to=permutation_extent)
            return new_tree

        # Create directories for simulation trees and parameter files

        if not os.path.isdir('{0}/alf_parameter_dir'.format(tmpdir)):
            os.mkdir('{0}/alf_parameter_dir'.format(tmpdir))
        if not os.path.isdir('{0}/alf_trees_dir'.format(tmpdir)):
            os.mkdir('{0}/alf_trees_dir'.format(tmpdir))
        if not os.path.isdir(filepath):
            os.mkdir(filepath)
        if not os.path.isdir('{0}/true_trees'.format(filepath)):
            os.mkdir('{0}/true_trees'.format(filepath))
        if not os.path.isdir('{0}/true_trees/individual'.format(filepath)):
            os.mkdir('{0}/true_trees/individual'.format(filepath))
        if not os.path.isdir('{0}/dna_alignments'.format(filepath)):
            os.mkdir('{0}/dna_alignments'.format(filepath))
        if not os.path.isdir('{0}/aa_alignments'.format(filepath)):
            os.mkdir('{0}/aa_alignments'.format(filepath))

        # Assign numbers of genes to classes
        # list `mk` gives number of genes in each class

        if regime == 1:
            K = 1

        if tune is not None and mk is None:
            if tune == 0:
                proportions = [float(K) / M for x in range(K)]
            else:

                proportions = np.random.gamma(shape=float(M) / (tune * K),
                                              scale=tune * float(K) / M,
                                              size=K)

            s = sum(proportions)
            mk = [
                int((np.round(x * M / s) if x * M / s > 0.5 else 1.0))
                for x in proportions
            ]
            diff = M - sum(mk)
            if diff > 0:
                mk[mk.index(min(mk))] += diff
            else:
                mk[mk.index(max(mk))] += diff
            assert min(mk) > 0.0

        else:
            assert sum(mk) == M

        true_clustering = []
        for i in range(K):
            for j in range(mk[i]):
                true_clustering.append(i + 1)

        print 'Simulating {0} genes in {1} classes, distributed as {2}'.format(
            M, K, mk)

        # names = ['Sp{0}'.format(i) for i in range(1, n + 1)]

        print 'N classes =', K
        print 'N genes = ', M
        print 'N species =', n
        print 'Regime = ', regime
        print 'N permutations =', num_permutations
        print 'Tuning =', tune
        print 'mk =', mk
        print 'true clustering = ', true_clustering

        # Create simulation trees

        # Make a master tree

        master_tree = make_master_tree(
            n,
            method=master_tree_generator_method,
            inner_edge_params=inner_edge_params,
            leaf_params=leaf_params,
            distribution_func=branch_length_func,
        )
        class_trees = []
        parameter_files = []

        print 'Master tree = ', master_tree
        master_tree.write_to_file(
            '{0}/true_trees/master.tree'.format(filepath), suppress_NHX=True)

        # make K class trees

        for k in range(K):
            print 'Making class {0}/{1}'.format(k + 1, K)

            if num_permutations > 0:
                class_tree = make_class_tree(master_tree,
                                             num_permutations,
                                             class_tree_permuter,
                                             with_check=guarantee_unique,
                                             checklist=class_trees)
                class_trees.append(class_tree)
            else:

                class_tree = make_master_tree(
                    n,
                    method=master_tree_generator_method,
                    inner_edge_params=inner_edge_params,
                    leaf_params=leaf_params,
                    distribution_func=branch_length_func,
                )
                class_trees.append(class_tree)

            print 'class tree = ', class_tree
            class_tree.write_to_file('{0}/true_trees/class{1}.tree'.format(
                filepath, k + 1),
                                     suppress_NHX=True)

            # ALF only behaves itself if trees are in PAM units,
            # so we scale our newly-generated class trees to have branch lengths
            # in PAM units.
            # Our class_trees list contains unconverted trees

            if unit_is_pam:  # Default = True
                class_tree_PAM = class_tree.pam2sps(
                    'sps2pam')  # conversion from SPS to PAM
                class_tree_PAM.write_to_file(
                    '{0}/alf_trees_dir/class{1}_1.nwk'.format(tmpdir, k + 1),
                    suppress_NHX=True)

            # Write parameter files

            ngenes = mk[k]

            sim = SeqSim(
                simulation_name='class{0}_1'.format(k + 1),
                working_directory='{0}/alf_working_dir'.format(tmpdir),
                outfile_path='{0}/alf_parameter_dir'.format(tmpdir),
                unit_is_pam=unit_is_pam)  # make new simulation object
            sim.parameters['subst'] = self.parameters[
                'subst']  # copy over global parameters
            sim.parameters['indels'] = self.parameters['indels']
            sim.parameters['ratevar'] = self.parameters['ratevar']

            if regime in [1, 2]:

                sim.root_genome(number_of_genes=ngenes,
                                kappa=gene_length_kappa,
                                theta=gene_length_theta)
                sim.custom_tree('{0}/alf_trees_dir/class{1}_1.nwk'.format(
                    tmpdir, k + 1))
                sim.write_parameters()

                continue

            # For regimes 3 & 4 each gene within a class is simulated along its own tree:
            #   Under regime 3 each gene within a class has its branch lengths scaled
            #   by a scaling parameter, and under regime 4 each gene has random branch lengths
            #   drawn from some distribution (although individual branch rescaling might be better)

            for genes in range(ngenes):
                if regime == 3:
                    scale_factor = scale_func(*scale_params)
                    class_tree = class_trees[k].scale(scale_factor)
                elif regime == 4:
                    class_tree = \
                        class_trees[k].randomise_branch_lengths(inner_edges=inner_edge_params,
                            leaves=leaf_params,
                            distribution_func=branch_length_func)
                    if unit_is_pam:  # same conversion as before
                        class_tree = class_tree.pam2sps('sps2pam')

                class_tree.write_to_file(
                    '{0}/alf_trees_dir/class{1}_{2}.nwk'.format(
                        tmpdir, k + 1, genes + 1),
                    suppress_NHX=True)

                sim.root_genome(number_of_genes=1,
                                min_length=gene_length_min,
                                kappa=gene_length_kappa,
                                theta=gene_length_theta)
                sim.custom_tree('{0}/alf_trees_dir/class{1}_{2}.nwk'.format(
                    tmpdir, k + 1, genes + 1))
                sim.rename('class{0}_{1}'.format(k + 1, genes + 1))
                sim.write_parameters()

        # Estimate distances between base class trees

        # if unit_is_pam:
        #     class_trees = [x.pam2sps() for x in class_trees]

        geodists = []
        eucdists = []
        symdists = []
        wrfdists = []
        with open('{0}/basetrees.nwk'.format(tmpdir), 'w') as file:
            file.write('\n'.join([x.newick.rstrip() for x in class_trees]))
        os.system('java -jar {0}/gtp.jar -o {1}/baseout.txt {1}/basetrees.nwk'.
                  format(gtp_path, tmpdir))
        with open('{0}/baseout.txt'.format(tmpdir)) as file:
            for line in file:
                line = line.rstrip()
                if line:
                    (i, j, value) = line.split()
                    geodists.append(float(value))
        for a in range(K):
            tree_a = dpy.Tree.get_from_string(class_trees[a].newick, 'newick')
            for b in range(a + 1, K):
                tree_b = \
                    dpy.Tree.get_from_string(class_trees[b].newick,
                        'newick')
                eucdists.append(tree_a.euclidean_distance(tree_b))
                symdists.append(tree_a.symmetric_difference(tree_b))
                wrfdists.append(tree_a.robinson_foulds_distance(tree_b))

        writer = open('{0}/treedistances.txt'.format(filepath), 'w')
        writer.write('''True clustering:\t{0}
Class base tree distances:
geodesic\t{1}
euclidean\t{2}
RF\t{3}
wRF\t{4}

'''.format(true_clustering, np.mean(geodists), np.mean(eucdists),
           np.mean(symdists), np.mean(wrfdists)))
        writer.flush()

        # Run simulations, and correct ALF renaming bug

        parameter_files = \
            glob.glob('{0}/alf_parameter_dir/*.drw'.format(tmpdir))
        tree_files = glob.glob('{0}/alf_trees_dir/*.nwk'.format(tmpdir))
        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item))
        parameter_files.sort(key=sort_key)
        tree_files.sort(key=sort_key)
        files = zip(parameter_files, tree_files)

        for (params, tree) in files:
            self.runALF(params, quiet=quiet)
            name = params[params.rindex('/'):params.rindex('.')]
            (class_number, base_gene_number) = re.findall(r'(\d+)', name)
            tree_newick = open(tree).read()
            alf_newick = \
                open('{0}/alf_working_dir/{1}/RealTree.nwk'.format(tmpdir,
                     name)).read()
            replacement_dict = dict(
                zip(re.findall(r'(\w+)(?=:)', alf_newick),
                    re.findall(r'(\w+)(?=:)', tree_newick)))  # bug correction

            for dna_alignment in \
                sorted(glob.glob('{0}/alf_working_dir/{1}/MSA/*dna.fa'.format(tmpdir,
                       name)), key=sort_key):
                gene_number = dna_alignment[dna_alignment.rindex('/') +
                                            1:].split('_')[1]
                record = TCSeqRec(dna_alignment)
                record.sort_by_name()
                record.headers = [
                    replacement_dict[x[:x.rindex('/')]] for x in record.headers
                ]
                record.write_fasta(
                    '{0}/dna_alignments/class{1}_{2}.fas'.format(
                        filepath, class_number,
                        int(base_gene_number) + int(gene_number) - 1))
                record.write_phylip(
                    '{0}/dna_alignments/class{1}_{2}.phy'.format(
                        filepath, class_number,
                        int(base_gene_number) + int(gene_number) - 1))

            for aa_alignment in \
                sorted(glob.glob('{0}/alf_working_dir/{1}/MSA/*aa.fa'.format(tmpdir,
                       name)), key=sort_key):
                gene_number = aa_alignment[aa_alignment.rindex('/') +
                                           1:].split('_')[1]
                record = TCSeqRec(aa_alignment)
                record.sort_by_name()
                record.headers = [
                    replacement_dict[x[:x.rindex('/')]] for x in record.headers
                ]
                record.write_fasta('{0}/aa_alignments/class{1}_{2}.fas'.format(
                    filepath, class_number,
                    int(base_gene_number) + int(gene_number) - 1))
                record.write_phylip(
                    '{0}/aa_alignments/class{1}_{2}.phy'.format(
                        filepath, class_number,
                        int(base_gene_number) + int(gene_number) - 1))

            # Write true trees

            if regime in [1, 2]:
                for g in range(mk[int(class_number) - 1]):
                    Tree(tree_newick).pam2sps().write_to_file(
                        '{0}/true_trees/individual/class{1}_{2}.nwk'.format(
                            filepath, class_number, g + 1),
                        suppress_NHX=True)
            else:

                Tree(tree_newick).pam2sps().write_to_file(
                    '{0}/true_trees/individual/{1}.nwk'.format(filepath, name),
                    suppress_NHX=True)

        # Intra- and inter-class stats

        alltrees = \
            glob.glob('{0}/true_trees/individual/*.nwk'.format(filepath))

        alltrees.sort(key=sort_key)

        alltrees = [open(x).read().rstrip() for x in alltrees]

        dpytrees = [dpy.Tree.get_from_string(x, 'newick') for x in alltrees]

        # for x in range(len(alltrees)):
        #     print x,'\n',alltrees[x], '\n',dpy.Tree.get_from_string(alltrees[x],'newick').as_newick_string()

        geodists = np.zeros([M, M])
        eucdists = np.zeros([M, M])
        symdists = np.zeros([M, M])
        wrfdists = np.zeros([M, M])

        # using gtp.jar for geodesic distances

        with open('{0}/geotrees.nwk'.format(tmpdir), 'w') as file:
            file.write('\n'.join(alltrees))
        os.system(
            'java -jar {0}/gtp.jar -o {1}/output.txt {1}/geotrees.nwk'.format(
                gtp_path, tmpdir))
        with open('{0}/output.txt'.format(tmpdir)) as file:
            for line in file:
                line = line.rstrip()
                if line:
                    (i, j, value) = line.split()
                    i = int(i)
                    j = int(j)
                    value = float(value)
                    geodists[i, j] = geodists[j, i] = value

        for a in range(M):
            for b in range(a + 1, M):
                eucdists[a, b] = eucdists[b, a] = \
                    dpytrees[a].euclidean_distance(dpytrees[b])
                symdists[a, b] = symdists[b, a] = \
                    dpytrees[a].symmetric_difference(dpytrees[b])
                wrfdists[a, b] = wrfdists[b, a] = \
                    dpytrees[a].robinson_foulds_distance(dpytrees[b])

        geodic = class_stats(geodists, mk)
        eucdic = class_stats(eucdists, mk)
        symdic = class_stats(symdists, mk)
        wrfdic = class_stats(wrfdists, mk)

        writer.write('Geodesic class stats\n')
        for key in sorted(geodic):
            writer.write('{0}\t{1}\n'.format(key, geodic[key]))
        writer.write('\n')
        writer.flush()

        writer.write('Euc class stats\n')
        for key in sorted(eucdic):
            writer.write('{0}\t{1}\n'.format(key, eucdic[key]))
        writer.write('\n')
        writer.flush()

        writer.write('RF class stats\n')
        for key in sorted(symdic):
            writer.write('{0}\t{1}\n'.format(key, symdic[key]))
        writer.write('\n')
        writer.flush()

        writer.write('wRF class stats\n')
        for key in sorted(wrfdic):
            writer.write('{0}\t{1}\n'.format(key, wrfdic[key]))
        writer.write('\n')
        writer.flush()

        writer.close()

        shutil.rmtree('{0}/alf_parameter_dir'.format(tmpdir))
        shutil.rmtree('{0}/alf_trees_dir'.format(tmpdir))
        shutil.rmtree('{0}/alf_working_dir'.format(tmpdir))
        os.remove('{0}/output.txt'.format(tmpdir))
        os.remove('{0}/geotrees.nwk'.format(tmpdir))
        os.remove('{0}/basetrees.nwk'.format(tmpdir))
        os.remove('{0}/baseout.txt'.format(tmpdir))
Example #12
0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
    ):

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)

        tree.pam2sps('sps2pam').write_to_file(treefile)

        directorycheck_and_make(param_dir)
        directorycheck_and_make(working_dir)

        sim = cls(simulation_name=name,
                  working_directory=working_dir,
                  outfile_path=param_dir,
                  unit_is_pam=True)

        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)
        tree_newick = tree.newick
        alf_newick = \
            open('{0}/alf_working_dir/{1}/RealTree.nwk'.format(tmpdir,
                 name)).read()
        replacement_dict = dict(
            zip(re.findall(r'(\w+)(?=:)', alf_newick),
                re.findall(r'(\w+)(?=:)', tree_newick)))  # bug correction

        alignment = \
            glob.glob('{0}/alf_working_dir/{1}/MSA/*aa.fa'.format(tmpdir,
                      name))[0]

        new_record = TCSeqRec(alignment)
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()

        print new_record.seqlength
        new_record.headers = [
            replacement_dict[x[:x.rindex('/')]] for x in new_record.headers
        ]  # bug should be fixed
        new_record._update()
        new_record.sort_by_name()
        if split_lengths and gene_names:
            with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                trf.write('{0}\t{1}\n'.format('-'.join(gene_names),
                                              tree.newick))
            for rec in new_record.split_by_lengths(split_lengths, gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                trf.write('{0}\t{1}\n'.format(new_record.name, tree.newick))
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)