class SequenceCollection(object):

    """
    Orchestrating class that should:
    a) work as a central repository for the information generated by the
       subordinate classes, and
    b) be the only class directly interacted with by the user

    TO DO:
    implement consistent naming of methods (where appropriate)
    Prefixes:
    get_[something]  - returns the object implied by something
    put_[something]  - puts something in the class data structure
    show_[something] - prints something to screen
    plot_[something] - displays a plot of something
    _[something]     - private method
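
    Example usage (an illustrative sketch only; the directory path, the
    metric name "euc" and the cluster settings below are assumptions, not
    part of this class):

        collection = SequenceCollection("data/alignments", datatype="protein",
                                        get_distances=True)
        collection.put_trees(program="bionj")
        collection.put_distance_matrices("euc")
        collection.put_partitions("euc", "spectral", [2, 3, 4])
        collection.concatenate_records()
        collection.put_cluster_trees(program="bionj")
        print collection.get_scores()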
    """

    def __init__(
        self,
        input_dir=None,
        records=None,
        file_format="fasta",
        datatype="protein",
        helper="./class_files/DV_wrapper.drw",
        tmpdir="/tmp",
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        # Unset Variables

        # Store some mappings for data retrieval

        self.records_to_keys = {}
        self.keys_to_records = {}
        self.clusters_to_partitions = {}
        self.partitions = {}
        self.distance_matrices = {}
        self.concats = {}
        self.inferred_trees = {}
        self.Clustering = Clustering()

        # Store some data

        self.files = None
        self.file_format = file_format
        self.datatype = datatype
        self.records = []
        self.length = 0
        self.helper = helper

        # Set Variables

        self.tmpdir = tmpdir

        # Lambda for sorting by name and number

        sort_key = lambda item: tuple((int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item))
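        # e.g. sorted(["gene10.fas", "gene2.fas", "gene1.fas"], key=sort_key)
        # gives ["gene1.fas", "gene2.fas", "gene10.fas"], because numeric
        # runs are compared as integers rather than character by character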

        # Can give an input directory as optional argument
        # If given:
        #    read the alignment files
        #    optionally calculate pairwise distances
        #    store the sequence data

        if input_dir:

            files = self.get_files(input_dir, file_format)

            # file checks

            if files == 0:
                print "!!!"
                print "There was a problem reading files from {0}".format(input_dir)
                print "!!!"
                sys.exit()

            if get_distances and not os.path.isfile(helper):
                print "!!!"
                print "There was a problem finding the darwin helper at {0}".format(helper)
                print "!!!"
                sys.exit()

            # done

            files.sort(key=sort_key)
            self.put_records(files=files, record_list=None, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)
        elif records:

            # Can optionally give record objects directly if no input dir specified

            self.put_records(files=None, record_list=records, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances

        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper, tmpdir=tmpdir, overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper, tmpdir=tmpdir, overwrite=overwrite)

    def __str__(self):
        s = "SequenceCollection object:\n"
        s += "Contains {0} alignments\n".format(self.length)
        return s

    def __len__(self):
        return self.length

    def get_files(self, input_dir, file_format="fasta"):
        """
        Gets the list of alignment files from an input directory
        (*.fa, *.fas and *.phy files only).
        Returns the sorted list, or 0 if no files are found.
        """

        if file_format == "fasta":
            files = glob.glob("{0}/*.fa".format(input_dir))
            if len(files) == 0:
                files = glob.glob("{0}/*.fas".format(input_dir))
        elif file_format == "phylip":
            files = glob.glob("{0}/*.phy".format(input_dir))
        else:
            print "Unrecognised file format %s" % file_format
            files = None
        if not files:
            print "No sequence files found in {0}".format(input_dir)
            return 0
        return sorted(files)

    def dump_records(self, output_dir, records=None, file_format="phylip", use_hashname=True):
        """
        Dumps all sequence alignment records to an output directory
        Files are dumped in sequential phylip format; by default the
        names are hashed
        """

        directorycheck_and_make(output_dir)

        hash_translation = {}

        if not records:
            records = self.get_records()

        for rec in records:
            filename = rec._write_temp_phylip(output_dir, use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                print type(rec.name), rec.name, type(filename), filename
        cPickle.dump(hash_translation, open("{0}/hash_translation.pkl".format(output_dir), "w"))

    def hash(self, string):
        H = hashlib.sha1(string)
        return H.hexdigest()

    def gzip(self, filename):

        if not filename.endswith(".gz"):
            filename += ".gz"

        cPickle.dump(self, gz.open(filename, "wb"), protocol=-1)

    @classmethod
    def gunzip(cls, filename):

        return cPickle.load(gz.open(filename, "rb"))
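
    # Illustrative round-trip for gzip/gunzip (assumes the collection and
    # its records are picklable in the running environment):
    #
    #     collection.gzip("analysis.pkl")                # writes analysis.pkl.gz
    #     restored = SequenceCollection.gunzip("analysis.pkl.gz")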

    def put_records(self, files=None, record_list=None, file_format="fasta", datatype="protein"):
        """
        Reads sequence files from the list generated by get_files
        (or accepts a pre-built record_list) and stores the records
        in self.records
        """

        get_name = lambda i: i[i.rindex("/") + 1 : i.rindex(".")]

        if files and not record_list:
            record_list = [TCSeqRec(f, file_format=file_format, name=get_name(f), datatype=datatype) for f in files]
        elif not files and not record_list:

            print "Can't load records - no records or alignment files given"
            return

        records_to_keys = dict([(record.name, number) for (number, record) in enumerate(record_list)])
        keys_to_records = dict(enumerate(record_list))
        self.records = record_list
        self.length = len(record_list)
        self.records_to_keys = records_to_keys
        self.keys_to_records = keys_to_records

    def load_phyml_results(self, input_dir, records=None, use_hashname=False, program="phyml"):

        if not records:
            records = self.get_records()
        failures = []
        for rec in records:
            if use_hashname:
                name = rec.hashname()
            else:
                name = rec.name
            tree_file = "{0}/{1}.phy_phyml_tree.txt".format(input_dir, name)
            stats_file = "{0}/{1}.phy_phyml_stats.txt".format(input_dir, name)

            try:
                rec.tree.load_phyml_results(tree_file, stats_file, name=rec.name, program=program)
            except FileError:
                failures.append(rec.name)

        if failures:
            print "Couldn't load results for the following records:"
            for f in failures:
                print "   ", f

    def sanitise_records(self):
        """
        Sorts records alphabetically, trims whitespace from beginning
        of record headers, removes '/' characters from headers,
        replaces spaces with underscores, puts sequences into upper case
        """

        for rec in self.get_records():
            rec.sanitise()

    def put_dv_matrices(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        for rec in self.get_records():
            rec.dv = [rec.get_dv_matrix(tmpdir=tmpdir, helper=helper, overwrite=overwrite)]

    def put_trees(
        self,
        rec_list=None,
        program="treecollection",
        model=None,
        datatype=None,
        ncat=4,
        optimise="n",
        tmpdir=None,
        overwrite=True,
        verbose=False,
    ):

        if tmpdir is None:
            tmpdir = self.tmpdir
        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        for rec in rec_list:
            if overwrite is False:
                if rec.name in self.inferred_trees:
                    continue
            if program == "treecollection":
                tree = rec.get_TC_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "raxml":
                tree = rec.get_raxml_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "phyml":
                tree = rec.get_phyml_tree(
                    model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite, verbose=verbose
                )
            elif program == "bionj":
                tree = rec.get_bionj_tree(
                    model=model,
                    datatype=datatype,
                    tmpdir=tmpdir,
                    ncat=ncat,
                    optimise=optimise,
                    overwrite=overwrite,
                    verbose=verbose,
                )
            self.inferred_trees[rec.name] = tree

    def put_distance_matrices(self, metrics, tmpdir="/tmp", normalise=False):
        """
        Pass this function a metric or a list of metrics
        valid kwargs - normalise (bool)
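
        Example (illustrative; the metric names "euc" and "rf" are
        placeholders - valid names depend on what
        DistanceMatrix.get_distance_matrix accepts):

            collection.put_distance_matrices(["euc", "rf"], normalise=True)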
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        trees = [rec.tree for rec in self.get_records()]
        for metric in metrics:
            dm = DistanceMatrix(trees, tmpdir=tmpdir)
            dm.get_distance_matrix(metric, normalise=normalise)
            self.distance_matrices[metric] = dm

    def put_partition(self, metric, cluster_method, nclusters, prune=True, tmpdir=None, recalculate=False):

        if not tmpdir:
            tmpdir = self.tmpdir
        if metric not in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        partition_vector = self.Clustering.run_clustering(
            self.distance_matrices[metric], cluster_method, nclusters, prune=prune, recalculate=recalculate
        )

        self.clusters_to_partitions[(metric, cluster_method, nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return partition_vector

    def put_partition_vector(self, partition_vector, name):
        """
        Given a partition vector (i.e. a tuple containing the class-
        membership for each gene alignment), inserts the relevant data
        structures into the SequenceCollection object.
        NEXT: run concatenate_records(), put_cluster_trees()
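
        Example (hypothetical): with five alignments and the vector
        (1, 1, 2, 2, 3), the first two alignments go to cluster 1, the
        next two to cluster 2 and the last to cluster 3:

            collection.put_partition_vector((1, 1, 2, 2, 3), "manual_split")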
        """

        self.clusters_to_partitions[name] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)

    def put_partitions(self, metrics, cluster_methods, nclusters, prune=True, tmpdir=None, recalculate=False):
        """
        metrics, cluster_methods and nclusters are given as lists, or
        coerced into lists
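
        Example (illustrative; the metric and method names are placeholders
        and must match what Clustering.run_clustering supports):

            collection.put_partitions(metrics=["euc"],
                                      cluster_methods=["spectral", "hierarchical"],
                                      nclusters=[2, 3, 4])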
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        if not isinstance(cluster_methods, list):
            cluster_methods = [cluster_methods]
        if not isinstance(nclusters, list):
            nclusters = [nclusters]
        if tmpdir is None:
            tmpdir = self.tmpdir
        nclusters = sorted(nclusters, reverse=True)

        # names = [rec.name for rec in self.get_records()]

        for metric in metrics:
            print "Clustering {0} data".format(metric)
            self.Clustering.clear_cache()
            for cluster_method in cluster_methods:
                print " ", cluster_method
                for n in nclusters:
                    key = (metric, cluster_method, n)
                    if key in self.clusters_to_partitions:
                        continue
                    else:
                        self.put_partition(
                            metric, cluster_method, n, prune=prune, tmpdir=tmpdir, recalculate=recalculate
                        )

    def concatenate_records(self):
        for p in self.partitions.values():
            p.concatenate_records(self.keys_to_records)
            for concat in p.concats:
                if concat[0].name not in self.concats:
                    self.concats[concat[0].name] = concat

    def autotune(
        self,
        metric,
        prune=True,
        KMeans=True,
        recalculate=True,
        tmpdir=None,
        max_groups=None,
        min_groups=2,
        check_single=True,
    ):
        """
        Uses Perona and Zelnik-Manor's spectral rotation method to determine
        the number of clusters present in the data
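
        Example (illustrative; "euc" is a placeholder metric name):

            (partition_vector, scores) = collection.autotune("euc",
                                                             max_groups=10)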
        """

        if not tmpdir:
            tmpdir = self.tmpdir
        if metric not in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        dm = self.get_distance_matrices()[metric]

        if check_single and min_groups > 1:
            print "Checking for single cluster..."
            (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
                dm, prune=prune, KMeans=KMeans, recalculate=recalculate, max_groups=6, min_groups=1, verbose=False
            )
            if nclusters == 1:
                print "Single cluster found."
                print "Quality Scores: {0}".format(quality_scores)

                self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
                self.partitions[partition_vector] = Partition(partition_vector)
                return (partition_vector, quality_scores)
            else:
                print ">1 clusters found."
                print "Quality Scores: {0}".format(quality_scores)
                recalculate = False

        (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
            dm, prune=prune, KMeans=KMeans, recalculate=recalculate, max_groups=max_groups, min_groups=min_groups
        )

        self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return (partition_vector, quality_scores)

    def put_cluster_trees(
        self,
        program="treecollection",
        model=None,
        datatype=None,
        ncat=4,
        optimise="n",
        tmpdir="/tmp",
        overwrite=True,
        max_guide_trees=-1,
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if program == "treecollection":
            return self._put_best_TC_trees(tmpdir=tmpdir, overwrite=overwrite, max_guide_trees=max_guide_trees)
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        self.put_trees(
            rec_list=rec_list,
            program=program,
            model=model,
            ncat=ncat,
            optimise=optimise,
            datatype=datatype,
            tmpdir=tmpdir,
            overwrite=overwrite,
        )
        self.update_scores()

    def _put_best_TC_trees(self, tmpdir="/tmp", overwrite=True, max_guide_trees=-1):
        rec_list = self.get_cluster_records_with_memberships()
        for (rec, members) in rec_list:
            print "Calculating treecollection tree for {0}".format(rec.name),
            if rec.name in self.inferred_trees and not overwrite:
                print "Skipping - already calculated (overwrite set to False)"
                continue
            guidetrees = [self.keys_to_records[member].tree for member in members]
            if max_guide_trees > 0:
                guidetrees = guidetrees[:max_guide_trees]
            TCtrees = []
            pref = rec._write_temp_tc(make_guide_tree=False, tmpdir=tmpdir)
            pref = "{0}/{1}".format(tmpdir, pref)
            dv_file = pref + "_dv.txt"
            labels_file = pref + "_labels.txt"
            map_file = pref + "_map.txt"
            if len(guidetrees) > 1:
                print "(using best of {0} guidetrees)".format(len(guidetrees))
            else:
                print "(using single guidetree)"
            for t in guidetrees:
                guidetree_file = "{0}/{1}.nwk".format(tmpdir, t.name)
                n = t.reroot_newick()
                with open(guidetree_file, "w") as writer:
                    writer.write(n)
                TCtrees.append(Tree.new_treecollection_tree(dv_file, map_file, labels_file, guidetree_file, rec.name))
            best = min(TCtrees, key=lambda x: x.score)
            rec.tree = best
            self.inferred_trees[rec.name] = best
        self.update_scores()

    def update_scores(self):
        for partition in self.partitions.values():
            partition.update_score(self.concats)

    @staticmethod
    def _pivot(lst):
        new_lst = zip(*lst)
        return ["".join(x) for x in new_lst]

    def concatenate_list_of_records(self, records=None):
        if not records:
            records = self.get_records()
        concat = copy.deepcopy(records[0])
        for rec in records[1:]:
            concat += rec
        return concat

    def make_randomised_copy(self, tmpdir=None, get_distances=False, parallel_load=False, overwrite=True):

        shuffled_records = self.get_randomised_alignments()
        if not tmpdir:
            tmpdir = self.tmpdir
        randomised_copy = SequenceCollection(
            input_dir=None,
            records=shuffled_records,
            file_format=self.file_format,
            datatype=self.datatype,
            helper=self.helper,
            tmpdir=tmpdir,
            get_distances=get_distances,
            parallel_load=parallel_load,
            overwrite=overwrite,
        )
        return randomised_copy

    def show_memberships(self):

        partitions = self.get_partitions()
        for (compound_key, partition) in partitions:
            print " ".join(str(x) for x in compound_key)
            print partition
            print partition.get_membership()

    def simulate_from_record(
        self, record, output_dir, name, tmpdir, datatype=None, allow_nonsense=False, split_lengths=None, gene_names=None
    ):

        if not datatype:
            datatype = self.datatype
        if datatype == "protein":
            SeqSim.simulate_from_record_WAG(record, output_dir, name, tmpdir, allow_nonsense, split_lengths, gene_names)
        elif datatype == "dna":
            SeqSim.simulate_from_record_GTR(record, output_dir, name, tmpdir, allow_nonsense, split_lengths, gene_names)
        else:
            print "datatype {0} is not recognised".format(datatype)

    def simulate_from_result(self, key, output_dir, name, tmpdir, datatype=None, allow_nonsense=False):

        if not datatype:
            datatype = self.datatype
        p = self.get_partition(key)
        for c in p.concats:
            # bug: records in Partition objects aren't linked to trees
            updated_record = self.concats[c.name][0]

            members = c.name.split("-")
            lengths = [self.keys_to_records[int(x)].seqlength for x in members]
            names = ["sim" + self.keys_to_records[int(x)].name for x in members]
            self.simulate_from_record(
                updated_record,
                output_dir,
                name=name,
                tmpdir=tmpdir,
                allow_nonsense=allow_nonsense,
                split_lengths=lengths,
                gene_names=names,
            )

    #######################
    # Getters
    #######################

    def get_trees(self):
        return [rec.tree for rec in self.get_records()]

    def get_cluster_records(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name)
        )
        return [rec for (rec, _) in sorted(self.concats.values(), key=sort_key)]

    def get_cluster_records_with_memberships(self):
        """
        Returns all concatenated records from cluster analysis, paired
        with their cluster memberships
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name)
        )
        return sorted(self.concats.values(), key=sort_key)

    def get_cluster_trees(self):
        records = self.get_cluster_records()
        trees = [rec.tree for rec in records]
        return trees

    def get_score(self, key):
        return self.get_partition(key).score

    def get_partition(self, key):
        partition_vector = self.clusters_to_partitions[key]
        return self.partitions[partition_vector]

    def get_membership(self, key, flatten=False):
        return self.get_partition(key).get_membership(flatten=flatten)

    def get_partitions(self):
        return [(k, self.partitions[v]) for (k, v) in self.clusters_to_partitions.items()]

    def get_memberships(self, flatten=False):
        return [
            (k, self.partitions[v].get_membership(flatten=flatten)) for (k, v) in self.clusters_to_partitions.items()
        ]

    def get_scores(self):
        return [(k, self.partitions[v].score) for (k, v) in self.clusters_to_partitions.items()]

    def get_randomised_alignments(self):
        lengths = [rec.seqlength for rec in self.get_records()]
        names = self.get_names()
        datatype = self.records[0].datatype
        concat = self.concatenate_list_of_records()
        concat.shuffle()
        newrecs = concat.split_by_lengths(lengths, names)
        return newrecs

    def get_records(self):
        """
        Returns list of stored sequence records
        """

        return [self.keys_to_records[i] for i in range(self.length)]

    def get_names(self):
        """
        Returns a list of the names of the stored records
        """

        return [rec.name for rec in self.get_records()]

    def get_seqlengths(self):
        """
        Returns a list of the sequence lengths of the stored records
        """

        return [rec.seqlength for rec in self.get_records()]

    def get_distance_matrices(self):
        return self.distance_matrices

    def get_dv_matrices(self):
        dvs = {}
        for rec in self.get_records():
            dvs[rec.name] = rec.dv
        return dvs

    #########################
    # Plotters
    #########################

    def plot_dendrogram(self, metric, link, nclasses, show=True):

        plot_object = self.Clustering.plot_dendrogram((metric, link, nclasses))
        if show:
            plot_object.show()
        return plot_object

    def plot_heatmap(self, distance_matrix, partition, outfile=None):

        sort_partition = partition.get_membership(flatten=True)
        fig = distance_matrix.plot_heatmap(sort_partition=sort_partition)
        if outfile:
            fig.savefig("{0}.pdf".format(outfile))
        return fig

    def plot_embedding(
        self,
        partition_vector,
        distance_matrix,
        embedding="MDS",
        prune=True,
        dimensions=3,
        centre_of_mass=False,
        outfile=None,
        standardize=False,
        normalise=False,
        annotate=False,
    ):
        """
        Plots 2D and 3D embeddings of the trees (MDS or spectral), and
        optionally saves them as pdf files if an outfile is given.
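
        Example (illustrative; assumes trees and an "euc" distance matrix
        have already been computed):

            pvec = collection.put_partition("euc", "spectral", 3)
            dm = collection.get_distance_matrices()["euc"]
            (fig2d, fig3d) = collection.plot_embedding(pvec, dm,
                                                       embedding="MDS",
                                                       outfile="embedding")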
        """

        dm = distance_matrix.matrix
        partition_vector = np.array(partition_vector)
        labels = self.get_names()
        if embedding == "MDS":
            dbc = self.Clustering.get_double_centre(dm)
            (vals, vecs, var_exp) = self.Clustering.get_eigen(dbc, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3, normalise=normalise)
        elif embedding == "spectral":
            laplacian = self.Clustering.spectral(dm, prune=prune)

            (vals, vecs, var_exp) = self.Clustering.get_eigen(laplacian, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3, normalise=normalise)
        else:
            print "embedding should be one of 'MDS' or 'spectral'"
            print "value given was:", embedding
            return
        min_Z = min([z for (x, y, z) in coords])
        P = []  # get the indices of the partition vector for each group

        # and store in this list

        max_groups = max(partition_vector)
        for i in range(1, max_groups + 1):
            partition = np.where(partition_vector == i)
            P.append(partition)

        colors = "bgrcmyk"
        coldict = {"b": "blue", "g": "green", "r": "red", "c": "cyan", "m": "magenta", "y": "yellow", "k": "black"}
        fig2d = plt.figure()
        fig3d = plt.figure()
        ax2d = fig2d.add_subplot(111)
        ax3d = fig3d.add_subplot(111, projection="3d")

        for (pos, partition) in enumerate(P):
            for i in partition[0]:
                ax2d.scatter(color=colors[pos % len(colors)], *(coords[i])[:2])
                ax3d.scatter(color=colors[pos % len(colors)], *coords[i])
                ax3d.plot(
                    [coords[i][0], coords[i][0]],
                    [coords[i][1], coords[i][1]],
                    [min_Z, coords[i][2]],
                    color="grey",
                    linewidth=0.2,
                )

                if annotate:
                    ax2d.annotate(
                        labels[i],
                        xy=(coords[i][0], coords[i][1]),
                        xytext=(-20, 20),
                        textcoords="offset points",
                        fontsize="x-small",
                        ha="right",
                        va="bottom",
                        bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.5),
                        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0"),
                    )

            if centre_of_mass:
                com = np.mean(coords[partition], axis=0)
                ax2d.scatter(color="k", marker="x", s=2, *com[:2])
                ax3d.scatter(color="k", marker="x", s=2, *com)
        if embedding == "spectral" and normalise:
            (u, v) = np.mgrid[0 : 2 * np.pi : 20j, 0 : np.pi : 10j]
            x = np.cos(u) * np.sin(v)
            y = np.sin(u) * np.sin(v)
            z = np.cos(v)
            ax3d.plot_wireframe(x, y, z, color="grey", linewidth=0.2)

        ax2d.set_xlabel("PCo1")
        ax2d.set_ylabel("PCo2")
        ax2d.set_title("Trees embedded in dimension-reduced space")
        ax3d.set_xlabel("PCo1")
        ax3d.set_ylabel("PCo2")
        ax3d.set_zlabel("PCo3")
        ax3d.set_title("Trees embedded in dimension-reduced space")
        if outfile:
            fig2d.savefig("{0}-2d.pdf".format(outfile))
            fig3d.savefig("{0}-3d.pdf".format(outfile))
        return (fig2d, fig3d)

    #########################
    # Parallelisers
    #########################

    def _unpack_dv(self, packed_args):
        return packed_args[0].get_dv_matrix(*packed_args[1:])

    def _dv_parallel_call(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        nprocesses = min(self.length, multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, self.length)
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in self.get_records():
            new_dir = tmpdir + "/" + rec.name
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            args.append((rec, tmpdir + "/" + rec.name, helper, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_dv, args, callback=results.append)
        r.wait()
        for (w, x, y, z) in args:
            if os.path.isdir(x):
                os.rmdir(x)
        results = results[0]
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results))

    def put_dv_matrices_parallel(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        dv_matrices_dict = self._dv_parallel_call(tmpdir, helper, overwrite=overwrite)
        for rec in self.get_records():
            rec.dv = [dv_matrices_dict[rec.name]]

    def _unpack_bionj(self, packed_args):
        return packed_args[0].get_bionj_tree(*packed_args[1:])

    def _bionj_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=1, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_bionj, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_phyml(self, packed_args):
        return packed_args[0].get_phyml_tree(*packed_args[1:])

    def _phyml_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=4, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_phyml, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_raxml(self, packed_args):
        return packed_args[0].get_raxml_tree(*packed_args[1:])

    def _raxml_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_raxml, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def _unpack_TC(self, packed_args):
        return packed_args[0].get_TC_tree(*packed_args[1:])

    def _TC_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_TC, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def put_trees_parallel(
        self, rec_list=None, program="treecollection", model=None, datatype=None, ncat=4, tmpdir="/tmp", overwrite=True
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        if program == "treecollection":
            trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "raxml":
            trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "phyml":
            trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite
            )
        elif program == "bionj":
            trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite
            )
        for rec in rec_list:
            rec.tree = trees_dict[rec.name]
            self.inferred_trees[rec.name] = trees_dict[rec.name]

    def put_cluster_trees_parallel(
        self, program="treecollection", model=None, datatype=None, ncat=4, tmpdir="/tmp", overwrite=True
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        if program == "treecollection":
            cluster_trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "raxml":
            cluster_trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "phyml":
            cluster_trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir, overwrite=overwrite
            )
        elif program == "bionj":
            cluster_trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir, overwrite=overwrite
            )
        for rec in rec_list:
            rec.tree = cluster_trees_dict[rec.name]
        self.update_scores()
Example #2
0
from clustering import Clustering

cl = Clustering('magia_cluster_data')
#cl = Clustering('data_sample')
#cl.generate_clusters(eps=0.5, min_samples=5)

#cl.recommend_n(5931298, 4)

cl.spectral(8)
class SequenceCollection(object):
    """
    Orchestrating class that should:
    a) work as a central repository for the information generated by the
       subordinate classes, and
    b) be the only class directly interacted with by the user

    TO DO:
    implement consistent naming of methods (where appropriate)
    Prefixes:
    get_[something]  - returns the object implied by something
    put_[something]  - puts something in the class data structure
    show_[something] - prints something to screen
    plot_[something] - displays a plot of something
    _[something]     - private method
    """
    def __init__(
        self,
        input_dir=None,
        records=None,
        file_format='fasta',
        datatype='protein',
        helper='./class_files/DV_wrapper.drw',
        tmpdir='/tmp',
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        # Unset Variables

        # Store some mappings for data retrieval

        self.records_to_keys = {}
        self.keys_to_records = {}
        self.clusters_to_partitions = {}
        self.partitions = {}
        self.distance_matrices = {}
        self.concats = {}
        self.inferred_trees = {}
        self.Clustering = Clustering()

        # Store some data

        self.files = None
        self.file_format = file_format
        self.datatype = datatype
        self.records = []
        self.length = 0
        self.helper = helper

        # Set Variables

        self.tmpdir = tmpdir

        # Lambda for sorting by name and number

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item))

        # Can give an input directory as optional argument
        # If given:
        #    read the alignment files
        #    optionally calculate pairwise distances
        #    store the sequence data

        if input_dir:

            files = self.get_files(input_dir, file_format)

            # file checks

            if files == 0:
                print '!!!'
                print 'There was a problem reading files from {0}'.format(
                    input_dir)
                print '!!!'
                sys.exit()

            if get_distances and not os.path.isfile(helper):
                print '!!!'
                print 'There was a problem finding the darwin helper at {0}'.format(
                    helper)
                print '!!!'
                sys.exit()

            # done

            files.sort(key=sort_key)
            self.put_records(files=files,
                             record_list=None,
                             file_format=file_format,
                             datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)
        elif records:

            # Can optionally give record objects directly if no input dir specified

            self.put_records(files=None,
                             record_list=records,
                             file_format=file_format,
                             datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances

        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper,
                                              tmpdir=tmpdir,
                                              overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper,
                                     tmpdir=tmpdir,
                                     overwrite=overwrite)

    def __str__(self):
        s = 'SequenceCollection object:\n'
        s += 'Contains {0} alignments\n'.format(self.length)
        return s

    def __len__(self):
        return self.length

    def get_files(self, input_dir, file_format='fasta'):
        """
        Get list of alignment files from an input directory
        *.fa, *.fas and *.phy files only
        Stores in self.files
        """

        if file_format == 'fasta':
            files = glob.glob('{0}/*.fa'.format(input_dir))
            if len(files) == 0:
                files = glob.glob('{0}/*.fas'.format(input_dir))
        elif file_format == 'phylip':
            files = glob.glob('{0}/*.phy'.format(input_dir))
        else:
            print 'Unrecognised file format %s' % file_format
            files = None
        if not files:
            print 'No sequence files found in {0}'.format(input_dir)
            return 0
        return sorted(files)

    def dump_records(
        self,
        output_dir,
        records=None,
        file_format='phylip',
        use_hashname=True,
    ):
        """
        Dumps all sequence alignment records to an output directory
        Files are dumped in sequential phylip format; by default the
        names are hashed
        """

        directorycheck_and_make(output_dir)

        hash_translation = {}

        if not records:
            records = self.get_records()

        for rec in records:
            filename = rec._write_temp_phylip(output_dir,
                                              use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                print type(rec.name), rec.name, type(filename), filename
        cPickle.dump(hash_translation,
                     open('{0}/hash_translation.pkl'.format(output_dir), 'w'))

    def hash(self, string):
        H = hashlib.sha1(string)
        return H.hexdigest()

    def gzip(self, filename):

        if not filename.endswith('.gz'):
            filename += '.gz'

        cPickle.dump(self, file=gz.open(filename, 'wb'), protocol=-1)

    @classmethod
    def gunzip(cls, filename):

        return cPickle.load(gz.open(filename, 'rb'))

    def put_records(
        self,
        files=None,
        record_list=None,
        file_format='fasta',
        datatype='protein',
    ):
        """
        Reads sequence files from the list generated by
        get_files and stores in self.records
        """

        get_name = lambda i: i[i.rindex('/') + 1:i.rindex('.')]

        if files and not record_list:
            record_list = [
                TCSeqRec(f,
                         file_format=file_format,
                         name=get_name(f),
                         datatype=datatype) for f in files
            ]
        elif not files and not record_list:

            print 'Can\'t load records - no records or alignment files given'
            return

        records_to_keys = dict([(record.name, number)
                                for (number, record) in enumerate(record_list)
                                ])
        keys_to_records = dict(enumerate(record_list))
        self.records = record_list
        self.length = len(record_list)
        self.records_to_keys = records_to_keys
        self.keys_to_records = keys_to_records

    def load_phyml_results(
        self,
        input_dir,
        records=None,
        use_hashname=False,
        program='phyml',
    ):

        if not records:
            records = self.get_records()
        failures = []
        for rec in records:
            if use_hashname:
                name = rec.hashname()
            else:
                name = rec.name
            tree_file = '{0}/{1}.phy_phyml_tree.txt'.format(input_dir, name)
            stats_file = \
                '{0}/{1}.phy_phyml_stats.txt'.format(input_dir, name)

            try:
                rec.tree.load_phyml_results(tree_file,
                                            stats_file,
                                            name=rec.name,
                                            program=program)
            except FileError:
                failures.append(rec.name)

        if failures:
            print 'Couldn\'t load results for the following records:'
            for f in failures:
                print '   ', f

    def sanitise_records(self):
        """
        Sorts records alphabetically, trims whitespace from beginning
        of record headers, removes '/' characters from headers,
        replaces spaces with underscores, puts sequences into upper case
        """

        for rec in self.get_records():
            rec.sanitise()

    def put_dv_matrices(
        self,
        tmpdir='/tmp',
        helper='./class_files/DV_wrapper.drw',
        overwrite=True,
    ):

        for rec in self.get_records():
            rec.dv = [
                rec.get_dv_matrix(tmpdir=tmpdir,
                                  helper=helper,
                                  overwrite=overwrite)
            ]

    def put_trees(
        self,
        rec_list=None,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        optimise='n',
        tmpdir=None,
        overwrite=True,
        verbose=False,
    ):

        if tmpdir is None:
            tmpdir = self.tmpdir
        if not program in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if not rec_list:
            rec_list = self.records
        for rec in rec_list:
            if overwrite is False:
                if rec.name in self.inferred_trees:
                    continue
            if program == 'treecollection':
                tree = rec.get_TC_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == 'raxml':
                tree = rec.get_raxml_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == 'phyml':
                tree = rec.get_phyml_tree(
                    model=model,
                    datatype=datatype,
                    tmpdir=tmpdir,
                    ncat=ncat,
                    overwrite=overwrite,
                    verbose=verbose,
                )
            elif program == 'bionj':
                tree = rec.get_bionj_tree(
                    model=model,
                    datatype=datatype,
                    tmpdir=tmpdir,
                    ncat=ncat,
                    optimise=optimise,
                    overwrite=overwrite,
                    verbose=verbose,
                )
            self.inferred_trees[rec.name] = tree

    def put_distance_matrices(
        self,
        metrics,
        tmpdir='/tmp',
        normalise=False,
    ):
        """
        Pass this function a list of metrics
        valid kwargs - invert (bool), normalise (bool)
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        trees = [rec.tree for rec in self.get_records()]
        for metric in metrics:
            dm = DistanceMatrix(trees, tmpdir=tmpdir)
            dm.get_distance_matrix(metric, normalise=normalise)
            self.distance_matrices[metric] = dm

    def put_partition(
        self,
        metric,
        cluster_method,
        nclusters,
        prune=True,
        tmpdir=None,
        recalculate=False,
    ):

        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        partition_vector = \
            self.Clustering.run_clustering(self.distance_matrices[metric],
                cluster_method, nclusters, prune=prune,
                recalculate=recalculate)

        self.clusters_to_partitions[(metric, cluster_method,
                                     nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return partition_vector

    def put_partition_vector(self, partition_vector, name):
        """
        Given a partition vector (i.e. a tuple containing the class-
        membership for each gene alignment), inserts the relevant data
        structures into the SequenceCollection object.
        NEXT: run concatenate_records(), put_cluster_trees()
        """

        self.clusters_to_partitions[name] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)

    def put_partitions(
        self,
        metrics,
        cluster_methods,
        nclusters,
        prune=True,
        tmpdir=None,
        recalculate=False,
    ):
        """
        metrics, linkages and nclasses are given as lists, or coerced into
        lists
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        if not isinstance(cluster_methods, list):
            cluster_methods = [cluster_methods]
        if not isinstance(nclusters, list):
            nclusters = [nclusters]
        if tmpdir is None:
            tmpdir = self.tmpdir
        else:
            nclusters = sorted(nclusters, reverse=True)

        # names = [rec.name for rec in self.get_records()]

        for metric in metrics:
            print 'Clustering {0} data'.format(metric)
            self.Clustering.clear_cache()
            for cluster_method in cluster_methods:
                print ' ', cluster_method
                for n in nclusters:
                    key = (metric, cluster_method, n)
                    if key in self.clusters_to_partitions:
                        continue
                    else:
                        self.put_partition(
                            metric,
                            cluster_method,
                            n,
                            prune=prune,
                            tmpdir=tmpdir,
                            recalculate=recalculate,
                        )

    def concatenate_records(self):
        for p in self.partitions.values():
            p.concatenate_records(self.keys_to_records)
            for concat in p.concats:
                if not concat[0].name in self.concats:
                    self.concats[concat[0].name] = concat

    def autotune(
        self,
        metric,
        prune=True,
        KMeans=True,
        recalculate=True,
        tmpdir=None,
        max_groups=None,
        min_groups=2,
        check_single=True,
    ):
        """
        Uses Perona and Zelnick-Manor's spectral rotation method to determine
        the number of clusters present in the data
        """

        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        dm = self.get_distance_matrices()[metric]

        if check_single and min_groups > 1:
            print 'Checking for single cluster...'
            (partition_vector, nclusters, quality_scores) = \
                self.Clustering.run_spectral_rotate(
                dm,
                prune=prune,
                KMeans=KMeans,
                recalculate=recalculate,
                max_groups=6,
                min_groups=1,
                verbose=False,
                )
            if nclusters == 1:
                print 'Single cluster found.'
                print 'Quality Scores: {0}'.format(quality_scores)

                self.clusters_to_partitions[(metric, 'rotate',
                                             nclusters)] = partition_vector
                self.partitions[partition_vector] = \
                    Partition(partition_vector)
                return (partition_vector, quality_scores)
            else:
                print '>1 clusters found.'
                print 'Quality Scores: {0}'.format(quality_scores)
                recalculate = False

        (partition_vector, nclusters, quality_scores) = \
            self.Clustering.run_spectral_rotate(
            dm,
            prune=prune,
            KMeans=KMeans,
            recalculate=recalculate,
            max_groups=max_groups,
            min_groups=min_groups,
            )

        self.clusters_to_partitions[(metric, 'rotate', nclusters)] = \
            partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return (partition_vector, quality_scores)

    def put_cluster_trees(
        self,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        optimise='n',
        tmpdir='/tmp',
        overwrite=True,
        max_guide_trees=True,
    ):

        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if program == 'treecollection':
            return self._put_best_TC_trees(tmpdir=tmpdir,
                                           overwrite=overwrite,
                                           max_guide_trees=max_guide_trees)
        rec_list = self.get_cluster_records()
        print 'Inferring {0} cluster trees'.format(len(rec_list))
        self.put_trees(
            rec_list=rec_list,
            program=program,
            model=model,
            ncat=ncat,
            optimise=optimise,
            datatype=datatype,
            tmpdir=tmpdir,
            overwrite=overwrite,
        )
        self.update_scores()

    def _put_best_TC_trees(
        self,
        tmpdir='/tmp',
        overwrite=True,
        max_guide_trees=-1,
    ):
        rec_list = self.get_cluster_records_with_memberships()
        for (rec, members) in rec_list:
            print 'Calculating treecollection tree for {0}'.format(rec.name),
            if rec.name in self.inferred_trees and overwrite == False:
                print 'Skipping - already calculated (overwrite set to False)'
                continue
            guidetrees = [
                self.keys_to_records[member].tree for member in members
            ]
            if max_guide_trees > 0:
                guidetrees = guidetrees[:max_guide_trees]
            TCtrees = []
            pref = rec._write_temp_tc(make_guide_tree=False, tmpdir=tmpdir)
            pref = '{0}/{1}'.format(tmpdir, pref)
            dv_file = pref + '_dv.txt'
            labels_file = pref + '_labels.txt'
            map_file = pref + '_map.txt'
            if len(guidetrees) > 1:
                print '(using best of {0} guidetrees)'.format(len(guidetrees))
            else:
                print '(using single guidetree)'
            for t in guidetrees:
                guidetree_file = '{0}/{1}.nwk'.format(tmpdir, t.name)
                n = t.reroot_newick()
                with open(guidetree_file, 'w') as writer:
                    writer.write(n)
                TCtrees.append(
                    Tree.new_treecollection_tree(dv_file, map_file,
                                                 labels_file, guidetree_file,
                                                 rec.name))
            best = min(TCtrees, key=lambda x: x.score)
            rec.tree = best
            self.inferred_trees[rec.name] = best
        self.update_scores()

    def update_scores(self):
        for partition in self.partitions.values():
            partition.update_score(self.concats)

    def _pivot(lst):
        new_lst = zip(*lst)
        return [''.join(x) for x in new_lst]

    def concatenate_list_of_records(self, records=None):
        if not records:
            records = self.get_records()
        concat = copy.deepcopy(records[0])
        for rec in records[1:]:
            concat += rec
        return concat

    def make_randomised_copy(
        self,
        tmpdir=None,
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        shuffled_records = self.get_randomised_alignments()
        if not tmpdir:
            tmpdir = self.tmpdir
        randomised_copy = SequenceCollection(
            input_dir=None,
            records=shuffled_records,
            file_format=self.file_format,
            datatype=self.datatype,
            helper=self.helper,
            tmpdir=tmpdir,
            get_distances=get_distances,
            parallel_load=parallel_load,
            overwrite=overwrite,
        )
        return randomised_copy

    def show_memberships(self):

        partitions = self.get_partitions()
        for compound_key in partitions:
            print ' '.join(str(x) for x in compound_key)
            partition = partitions[compound_key]
            print partition
            print self.clustering.get_memberships(partition)

    def simulate_from_record(
        self,
        record,
        output_dir,
        name,
        tmpdir,
        datatype=None,
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
    ):

        if not datatype:
            datatype = self.datatype
        if datatype == 'protein':
            SeqSim.simulate_from_record_WAG(
                record,
                output_dir,
                name,
                tmpdir,
                allow_nonsense,
                split_lengths,
                gene_names,
            )
        elif datatype == 'dna':
            SeqSim.simulate_from_record_GTR(
                record,
                output_dir,
                name,
                tmpdir,
                allow_nonsense,
                split_lengths,
                gene_names,
            )
        else:
            print 'datatype {0} is not recognised'.format(datatype)

    def simulate_from_result(
        self,
        key,
        output_dir,
        name,
        tmpdir,
        datatype=None,
        allow_nonsense=False,
    ):

        if not datatype:
            datatype = self.datatype
        p = self.get_partition(key)
        for c in p.concats:
            # bug: records in Partition objects aren't linked to trees
            updated_record = self.concats[c.name][0]

            members = c.name.split('-')
            lengths = [self.keys_to_records[int(x)].seqlength for x in members]
            names = [
                'sim' + self.keys_to_records[int(x)].name for x in members
            ]
            self.simulate_from_record(
                updated_record,
                output_dir,
                name=name,
                tmpdir=tmpdir,
                allow_nonsense=allow_nonsense,
                split_lengths=lengths,
                gene_names=names,
            )

#######################
# Getters
#######################

    def get_trees(self):
        return [rec.tree for rec in self.get_records()]

    def get_cluster_records(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item[0].name))
        return [
            rec for (rec, _) in sorted(self.concats.values(), key=sort_key)
        ]

    def get_cluster_records_with_memberships(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item[0].name))
        return sorted(self.concats.values(), key=sort_key)

    def get_cluster_trees(self):
        records = self.get_cluster_records()
        trees = [rec.tree for rec in records]
        return trees

    def get_score(self, key):
        return self.get_partition(key).score

    def get_partition(self, key):
        partition_vector = self.clusters_to_partitions[key]
        return self.partitions[partition_vector]

    def get_membership(self, key, flatten=False):
        return self.get_partition(key).get_membership(flatten=flatten)

    def get_partitions(self):
        return [(k, self.partitions[v])
                for (k, v) in self.clusters_to_partitions.items()]

    def get_memberships(self, flatten=False):
        return [(k, self.partitions[v].get_membership(flatten=flatten))
                for (k, v) in self.clusters_to_partitions.items()]

    def get_scores(self):
        return [(k, self.partitions[v].score)
                for (k, v) in self.clusters_to_partitions.items()]

    def get_randomised_alignments(self):
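        """
        Concatenates all stored alignments, shuffles the concatenated record,
        then splits it back into alignments with the original lengths and
        names
        """
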
        lengths = [rec.seqlength for rec in self.get_records()]
        names = self.get_names()
        datatype = self.records[0].datatype
        concat = self.concatenate_list_of_records()
        concat.shuffle()
        newrecs = concat.split_by_lengths(lengths, names)
        return newrecs

    def get_records(self):
        """
        Returns list of stored sequence records
        """

        return [self.keys_to_records[i] for i in range(self.length)]

    def get_names(self):
        """
        Returns a list of the names of the stored records
        """

        return [rec.name for rec in self.get_records()]

    def get_seqlengths(self):
        """
        Returns a list of the sequence lengths of the stored records
        """

        return [rec.seqlength for rec in self.get_records()]

    def get_distance_matrices(self):
        return self.distance_matrices

    def get_dv_matrices(self):
        dvs = {}
        for rec in self.get_records():
            dvs[rec.name] = rec.dv
        return dvs

#########################
# Plotters
#########################

    def plot_dendrogram(
        self,
        metric,
        link,
        nclasses,
        show=True,
    ):
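        """
        Plots the hierarchical-clustering dendrogram for the given
        (metric, link, nclasses) combination, optionally displaying it on
        screen
        """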

        plot_object = self.Clustering.plot_dendrogram((metric, link, nclasses))
        if show:
            plot_object.show()
        return plot_object

    def plot_heatmap(
        self,
        distance_matrix,
        partition,
        outfile=None,
    ):

        sort_partition = partition.get_membership(flatten=True)
        fig = \
            distance_matrix.plot_heatmap(sort_partition=sort_partition)
        if outfile:
            fig.savefig('{0}.pdf'.format(outfile))
        return fig

    def plot_embedding(
        self,
        partition_vector,
        distance_matrix,
        embedding='MDS',
        prune=True,
        dimensions=3,
        centre_of_mass=False,
        outfile=None,
        standardize=False,
        normalise=False,
        annotate=False,
    ):
        """
        Plots an embedding of the trees in a Principal Coordinate space,
        and saves as pdf.
        """

        dm = distance_matrix.matrix
        partition_vector = np.array(partition_vector)
        labels = self.get_names()
        if embedding == 'MDS':
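            # classical MDS: double-centre the distance matrix and use its
            # leading eigenvectors as coordinates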
            dbc = self.Clustering.get_double_centre(dm)
            (vals, vecs,
             var_exp) = self.Clustering.get_eigen(dbc, standardize=standardize)
            (coords,
             _) = self.Clustering.get_coords_by_dimension(vals,
                                                          vecs,
                                                          var_exp,
                                                          3,
                                                          normalise=normalise)
        elif embedding == 'spectral':
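            # spectral embedding: build a graph Laplacian from the distance
            # matrix and eigendecompose it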
            laplacian = self.Clustering.spectral(dm, prune=prune)

            (vals, vecs, var_exp) = \
                self.Clustering.get_eigen(laplacian,
                    standardize=standardize)
            (coords,
             _) = self.Clustering.get_coords_by_dimension(vals,
                                                          vecs,
                                                          var_exp,
                                                          3,
                                                          normalise=normalise)
        else:
            print 'embedding should be one of \'MDS\' or \'spectral\''
            print 'value given was:', embedding
            return
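        # the minimum z-value is the 'floor' that the vertical guide lines in
        # the 3D plot drop down to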
        min_Z = min([z for (x, y, z) in coords])
        # get the indices of the partition vector for each group and store
        # them in this list
        P = []

        max_groups = max(partition_vector)
        for i in range(1, max_groups + 1):
            partition = np.where(partition_vector == i)
            P.append(partition)

        colors = 'bgrcmyk'
        coldict = {
            'b': 'blue',
            'g': 'green',
            'r': 'red',
            'c': 'cyan',
            'm': 'magenta',
            'y': 'yellow',
            'k': 'black',
        }
        fig2d = plt.figure()
        fig3d = plt.figure()
        ax2d = fig2d.add_subplot(111)
        ax3d = fig3d.add_subplot(111, projection='3d')

        for (pos, partition) in enumerate(P):
            for i in partition[0]:
                ax2d.scatter(color=colors[pos % len(colors)], *(coords[i])[:2])
                ax3d.scatter(color=colors[pos % len(colors)], *coords[i])
                ax3d.plot([coords[i][0], coords[i][0]],
                          [coords[i][1], coords[i][1]], [min_Z, coords[i][2]],
                          color='grey',
                          linewidth=0.2)

                if annotate:
                    ax2d.annotate(
                        labels[i],
                        xy=(coords[i][0], coords[i][1]),
                        xytext=(-20, 20),
                        textcoords='offset points',
                        fontsize='x-small',
                        ha='right',
                        va='bottom',
                        bbox=dict(boxstyle='round,pad=0.5',
                                  fc='yellow',
                                  alpha=0.5),
                        arrowprops=dict(arrowstyle='->',
                                        connectionstyle='arc3,rad=0'),
                    )

            if centre_of_mass:
                com = np.mean(coords[partition], axis=0)
                ax2d.scatter(color='k', marker='x', s=2, *com[:2])
                ax3d.scatter(color='k', marker='x', s=2, *com)
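        # normalised spectral coordinates lie on the unit sphere, so draw a
        # reference wireframe sphere behind the points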
        if embedding == 'spectral' and normalise:
            (u, v) = np.mgrid[0:2 * np.pi:20j, 0:np.pi:10j]
            x = np.cos(u) * np.sin(v)
            y = np.sin(u) * np.sin(v)
            z = np.cos(v)
            ax3d.plot_wireframe(x, y, z, color='grey', linewidth=0.2)

        ax2d.set_xlabel('PCo1')
        ax2d.set_ylabel('PCo2')
        ax2d.set_title('Trees embedded in dimension-reduced space')
        ax3d.set_xlabel('PCo1')
        ax3d.set_ylabel('PCo2')
        ax3d.set_zlabel('PCo3')
        ax3d.set_title('Trees embedded in dimension-reduced space')
        if outfile:
            fig2d.savefig('{0}-2d.pdf'.format(outfile))
            fig3d.savefig('{0}-3d.pdf'.format(outfile))
        return (fig2d, fig3d)

#########################
# Parallelisers
#########################

    def _unpack_dv(self, packed_args):
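        # multiprocessing.Pool workers take a single argument, so each
        # _unpack_* helper re-expands a packed (record, *args) tuple into the
        # corresponding record method call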
        return packed_args[0].get_dv_matrix(*packed_args[1:])

    def _dv_parallel_call(
        self,
        tmpdir='/tmp',
        helper='./class_files/DV_wrapper.drw',
        overwrite=True,
    ):
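        """
        Calculates each record's DV matrix (via the darwin helper script) in
        a multiprocessing pool, one job per record, and returns a dict
        mapping record names to results
        """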

        nprocesses = min(self.length, multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, self.length)
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in self.get_records():
            new_dir = tmpdir + '/' + rec.name
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            args.append((rec, new_dir, helper, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_dv, args, callback=results.append)
        r.wait()
        for (_, rec_dir, _, _) in args:
            if os.path.isdir(rec_dir):
                os.rmdir(rec_dir)
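        # map_async's callback receives the complete list of results in one
        # call, so results is a one-element list; unwrap it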
        results = results[0]
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results))

    def put_dv_matrices_parallel(
        self,
        tmpdir='/tmp',
        helper='./class_files/DV_wrapper.drw',
        overwrite=True,
    ):

        dv_matrices_dict = self._dv_parallel_call(tmpdir,
                                                  helper,
                                                  overwrite=overwrite)
        for rec in self.get_records():
            rec.dv = [dv_matrices_dict[rec.name]]

    def _unpack_bionj(self, packed_args):
        return packed_args[0].get_bionj_tree(*packed_args[1:])

    def _bionj_parallel_call(
        self,
        model=None,
        datatype=None,
        rec_list=None,
        ncat=1,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((
                rec,
                model,
                datatype,
                ncat,
                tmpdir,
                overwrite,
            ))
            names.append(rec.name)
        r = pool.map_async(self._unpack_bionj, args, callback=results.append)
        r.wait()
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results[0]))

    def _unpack_phyml(self, packed_args):
        return packed_args[0].get_phyml_tree(*packed_args[1:])

    def _phyml_parallel_call(
        self,
        model=None,
        datatype=None,
        rec_list=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((
                rec,
                model,
                datatype,
                ncat,
                tmpdir,
                overwrite,
            ))
            names.append(rec.name)
        r = pool.map_async(self._unpack_phyml, args, callback=results.append)
        r.wait()
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results[0]))

    def _unpack_raxml(self, packed_args):
        return packed_args[0].get_raxml_tree(*packed_args[1:])

    def _raxml_parallel_call(
        self,
        rec_list=None,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_raxml, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def _unpack_TC(self, packed_args):
        return packed_args[0].get_TC_tree(*packed_args[1:])

    def _TC_parallel_call(
        self,
        rec_list=None,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_TC, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def put_trees_parallel(
        self,
        rec_list=None,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):
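        """
        Infers a tree for each record in parallel, dispatching to one of the
        'treecollection', 'raxml', 'phyml' or 'bionj' back-ends, and stores
        the result on the record and in self.inferred_trees
        """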

        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if not rec_list:
            rec_list = self.records
        if program == 'treecollection':
            trees_dict = self._TC_parallel_call(rec_list=rec_list,
                                                tmpdir=tmpdir,
                                                overwrite=overwrite)
        elif program == 'raxml':
            trees_dict = self._raxml_parallel_call(rec_list=rec_list,
                                                   tmpdir=tmpdir,
                                                   overwrite=overwrite)
        elif program == 'phyml':
            trees_dict = self._phyml_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                tmpdir=tmpdir,
                ncat=ncat,
                overwrite=overwrite,
            )
        elif program == 'bionj':
            trees_dict = self._bionj_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                tmpdir=tmpdir,
                ncat=ncat,
                overwrite=overwrite,
            )
        for rec in rec_list:
            rec.tree = trees_dict[rec.name]
            self.inferred_trees[rec.name] = trees_dict[rec.name]

    def put_cluster_trees_parallel(
        self,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):
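        """
        Infers trees for the concatenated cluster records in parallel,
        dispatching to one of the 'treecollection', 'raxml', 'phyml' or
        'bionj' back-ends
        """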

        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        rec_list = self.get_cluster_records()
        print 'Inferring {0} cluster trees'.format(len(rec_list))
        if program == 'treecollection':
            cluster_trees_dict = \
                self._TC_parallel_call(rec_list=rec_list,
                    tmpdir=tmpdir, overwrite=overwrite)
        elif program == 'raxml':
            cluster_trees_dict = \
                self._raxml_parallel_call(rec_list=rec_list,
                    tmpdir=tmpdir, overwrite=overwrite)
        elif program == 'phyml':
            cluster_trees_dict = self._phyml_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                ncat=ncat,
                tmpdir=tmpdir,
                overwrite=overwrite,
            )
        elif program == 'bionj':
            cluster_trees_dict = self._bionj_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                ncat=ncat,
                tmpdir=tmpdir,
                overwrite=overwrite,
            )
        for rec in rec_list:
            rec.tree = cluster_trees_dict[rec.name]
        self.update_scores()