class SequenceCollection(object):

    """
    Orchestrating class that should:
    a) work as a central repository for the information generated by the
       subordinate classes, and
    b) be the only class directly interacted with by the user

    TO DO:
    implement consistent naming of methods (where appropriate)
    Prefixes:
    get_[something]  - returns the object implied by something
    put_[something]  - puts something in the class data structure
    show_[something] - prints something to screen
    plot_[something] - displays a plot of something
    _[something]     - private method
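
    Example usage (an illustrative sketch only; the directory path, the
    metric name "euc" and the cluster settings below are assumptions, not
    part of this class):

        collection = SequenceCollection("data/alignments", datatype="protein",
                                        get_distances=True)
        collection.put_trees(program="bionj")
        collection.put_distance_matrices("euc")
        collection.put_partitions("euc", "spectral", [2, 3, 4])
        collection.concatenate_records()
        collection.put_cluster_trees(program="bionj")
        print collection.get_scores()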
    """

    def __init__(
        self,
        input_dir=None,
        records=None,
        file_format="fasta",
        datatype="protein",
        helper="./class_files/DV_wrapper.drw",
        tmpdir="/tmp",
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        # Unset Variables

        # Store some mappings for data retrieval

        self.records_to_keys = {}
        self.keys_to_records = {}
        self.clusters_to_partitions = {}
        self.partitions = {}
        self.distance_matrices = {}
        self.concats = {}
        self.inferred_trees = {}
        self.Clustering = Clustering()

        # Store some data

        self.files = None
        self.file_format = file_format
        self.datatype = datatype
        self.records = []
        self.length = 0
        self.helper = helper

        # Set Variables

        self.tmpdir = tmpdir

        # Lambda for sorting by name and number

        sort_key = lambda item: tuple((int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item))
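        # e.g. sorted(["gene10.fas", "gene2.fas", "gene1.fas"], key=sort_key)
        # gives ["gene1.fas", "gene2.fas", "gene10.fas"], because numeric
        # runs are compared as integers rather than character by character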

        # Can give an input directory as optional argument
        # If given:
        #    read the alignment files
        #    optionally calculate pairwise distances
        #    store the sequence data

        if input_dir:

            files = self.get_files(input_dir, file_format)

            # file checks

            if files == 0:
                print "!!!"
                print "There was a problem reading files from {0}".format(input_dir)
                print "!!!"
                sys.exit()

            if get_distances and not os.path.isfile(helper):
                print "!!!"
                print "There was a problem finding the darwin helper at {0}".format(helper)
                print "!!!"
                sys.exit()

            # done

            files.sort(key=sort_key)
            self.put_records(files=files, record_list=None, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)
        elif records:

            # Can optionally give record objects directly if no input dir specified

            self.put_records(files=None, record_list=records, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances

        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper, tmpdir=tmpdir, overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper, tmpdir=tmpdir, overwrite=overwrite)

    def __str__(self):
        s = "SequenceCollection object:\n"
        s += "Contains {0} alignments\n".format(self.length)
        return s

    def __len__(self):
        return self.length

    def get_files(self, input_dir, file_format="fasta"):
        """
        Gets the list of alignment files from an input directory
        (*.fa, *.fas and *.phy files only).
        Returns the sorted list, or 0 if no files are found.
        """

        if file_format == "fasta":
            files = glob.glob("{0}/*.fa".format(input_dir))
            if len(files) == 0:
                files = glob.glob("{0}/*.fas".format(input_dir))
        elif file_format == "phylip":
            files = glob.glob("{0}/*.phy".format(input_dir))
        else:
            print "Unrecognised file format %s" % file_format
            files = None
        if not files:
            print "No sequence files found in {0}".format(input_dir)
            return 0
        return sorted(files)

    def dump_records(self, output_dir, records=None, file_format="phylip", use_hashname=True):
        """
        Dumps all sequence alignment records to an output directory
        Files are dumped in sequential phylip format; by default the
        names are hashed
        """

        directorycheck_and_make(output_dir)

        hash_translation = {}

        if not records:
            records = self.get_records()

        for rec in records:
            filename = rec._write_temp_phylip(output_dir, use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                print type(rec.name), rec.name, type(filename), filename
        cPickle.dump(hash_translation, open("{0}/hash_translation.pkl".format(output_dir), "w"))

    def hash(self, string):
        H = hashlib.sha1(string)
        return H.hexdigest()

    def gzip(self, filename):

        if not filename.endswith(".gz"):
            filename += ".gz"

        cPickle.dump(self, gz.open(filename, "wb"), protocol=-1)

    @classmethod
    def gunzip(cls, filename):

        return cPickle.load(gz.open(filename, "rb"))
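
    # Illustrative round-trip for gzip/gunzip (assumes the collection and
    # its records are picklable in the running environment):
    #
    #     collection.gzip("analysis.pkl")                # writes analysis.pkl.gz
    #     restored = SequenceCollection.gunzip("analysis.pkl.gz")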

    def put_records(self, files=None, record_list=None, file_format="fasta", datatype="protein"):
        """
        Reads sequence files from the list generated by get_files
        (or accepts a pre-built record_list) and stores the records
        in self.records
        """

        get_name = lambda i: i[i.rindex("/") + 1 : i.rindex(".")]

        if files and not record_list:
            record_list = [TCSeqRec(f, file_format=file_format, name=get_name(f), datatype=datatype) for f in files]
        elif not files and not record_list:

            print "Can't load records - no records or alignment files given"
            return

        records_to_keys = dict([(record.name, number) for (number, record) in enumerate(record_list)])
        keys_to_records = dict(enumerate(record_list))
        self.records = record_list
        self.length = len(record_list)
        self.records_to_keys = records_to_keys
        self.keys_to_records = keys_to_records

    def load_phyml_results(self, input_dir, records=None, use_hashname=False, program="phyml"):

        if not records:
            records = self.get_records()
        failures = []
        for rec in records:
            if use_hashname:
                name = rec.hashname()
            else:
                name = rec.name
            tree_file = "{0}/{1}.phy_phyml_tree.txt".format(input_dir, name)
            stats_file = "{0}/{1}.phy_phyml_stats.txt".format(input_dir, name)

            try:
                rec.tree.load_phyml_results(tree_file, stats_file, name=rec.name, program=program)
            except FileError:
                failures.append(rec.name)

        if failures:
            print "Couldn't load results for the following records:"
            for f in failures:
                print "   ", f

    def sanitise_records(self):
        """
        Sorts records alphabetically, trims whitespace from beginning
        of record headers, removes '/' characters from headers,
        replaces spaces with underscores, puts sequences into upper case
        """

        for rec in self.get_records():
            rec.sanitise()

    def put_dv_matrices(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        for rec in self.get_records():
            rec.dv = [rec.get_dv_matrix(tmpdir=tmpdir, helper=helper, overwrite=overwrite)]

    def put_trees(
        self,
        rec_list=None,
        program="treecollection",
        model=None,
        datatype=None,
        ncat=4,
        optimise="n",
        tmpdir=None,
        overwrite=True,
        verbose=False,
    ):

        if tmpdir is None:
            tmpdir = self.tmpdir
        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        for rec in rec_list:
            if overwrite is False:
                if rec.name in self.inferred_trees:
                    continue
            if program == "treecollection":
                tree = rec.get_TC_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "raxml":
                tree = rec.get_raxml_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "phyml":
                tree = rec.get_phyml_tree(
                    model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite, verbose=verbose
                )
            elif program == "bionj":
                tree = rec.get_bionj_tree(
                    model=model,
                    datatype=datatype,
                    tmpdir=tmpdir,
                    ncat=ncat,
                    optimise=optimise,
                    overwrite=overwrite,
                    verbose=verbose,
                )
            self.inferred_trees[rec.name] = tree

    def put_distance_matrices(self, metrics, tmpdir="/tmp", normalise=False):
        """
        Pass this function a metric or a list of metrics
        valid kwargs - normalise (bool)
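
        Example (illustrative; the metric names "euc" and "rf" are
        placeholders - valid names depend on what
        DistanceMatrix.get_distance_matrix accepts):

            collection.put_distance_matrices(["euc", "rf"], normalise=True)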
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        trees = [rec.tree for rec in self.get_records()]
        for metric in metrics:
            dm = DistanceMatrix(trees, tmpdir=tmpdir)
            dm.get_distance_matrix(metric, normalise=normalise)
            self.distance_matrices[metric] = dm

    def put_partition(self, metric, cluster_method, nclusters, prune=True, tmpdir=None, recalculate=False):

        if not tmpdir:
            tmpdir = self.tmpdir
        if metric not in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        partition_vector = self.Clustering.run_clustering(
            self.distance_matrices[metric], cluster_method, nclusters, prune=prune, recalculate=recalculate
        )

        self.clusters_to_partitions[(metric, cluster_method, nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return partition_vector

    def put_partition_vector(self, partition_vector, name):
        """
        Given a partition vector (i.e. a tuple containing the class-
        membership for each gene alignment), inserts the relevant data
        structures into the SequenceCollection object.
        NEXT: run concatenate_records(), put_cluster_trees()
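
        Example (hypothetical): with five alignments and the vector
        (1, 1, 2, 2, 3), the first two alignments go to cluster 1, the
        next two to cluster 2 and the last to cluster 3:

            collection.put_partition_vector((1, 1, 2, 2, 3), "manual_split")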
        """

        self.clusters_to_partitions[name] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)

    def put_partitions(self, metrics, cluster_methods, nclusters, prune=True, tmpdir=None, recalculate=False):
        """
        metrics, cluster_methods and nclusters are given as lists, or
        coerced into lists
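
        Example (illustrative; the metric and method names are placeholders
        and must match what Clustering.run_clustering supports):

            collection.put_partitions(metrics=["euc"],
                                      cluster_methods=["spectral", "hierarchical"],
                                      nclusters=[2, 3, 4])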
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        if not isinstance(cluster_methods, list):
            cluster_methods = [cluster_methods]
        if not isinstance(nclusters, list):
            nclusters = [nclusters]
        if tmpdir is None:
            tmpdir = self.tmpdir
        nclusters = sorted(nclusters, reverse=True)

        # names = [rec.name for rec in self.get_records()]

        for metric in metrics:
            print "Clustering {0} data".format(metric)
            self.Clustering.clear_cache()
            for cluster_method in cluster_methods:
                print " ", cluster_method
                for n in nclusters:
                    key = (metric, cluster_method, n)
                    if key in self.clusters_to_partitions:
                        continue
                    else:
                        self.put_partition(
                            metric, cluster_method, n, prune=prune, tmpdir=tmpdir, recalculate=recalculate
                        )

    def concatenate_records(self):
        for p in self.partitions.values():
            p.concatenate_records(self.keys_to_records)
            for concat in p.concats:
                if concat[0].name not in self.concats:
                    self.concats[concat[0].name] = concat

    def autotune(
        self,
        metric,
        prune=True,
        KMeans=True,
        recalculate=True,
        tmpdir=None,
        max_groups=None,
        min_groups=2,
        check_single=True,
    ):
        """
        Uses Perona and Zelnik-Manor's spectral rotation method to determine
        the number of clusters present in the data
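
        Example (illustrative; "euc" is a placeholder metric name):

            (partition_vector, scores) = collection.autotune("euc",
                                                             max_groups=10)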
        """

        if not tmpdir:
            tmpdir = self.tmpdir
        if metric not in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        dm = self.get_distance_matrices()[metric]

        if check_single and min_groups > 1:
            print "Checking for single cluster..."
            (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
                dm, prune=prune, KMeans=KMeans, recalculate=recalculate, max_groups=6, min_groups=1, verbose=False
            )
            if nclusters == 1:
                print "Single cluster found."
                print "Quality Scores: {0}".format(quality_scores)

                self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
                self.partitions[partition_vector] = Partition(partition_vector)
                return (partition_vector, quality_scores)
            else:
                print ">1 clusters found."
                print "Quality Scores: {0}".format(quality_scores)
                recalculate = False

        (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
            dm, prune=prune, KMeans=KMeans, recalculate=recalculate, max_groups=max_groups, min_groups=min_groups
        )

        self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return (partition_vector, quality_scores)

    def put_cluster_trees(
        self,
        program="treecollection",
        model=None,
        datatype=None,
        ncat=4,
        optimise="n",
        tmpdir="/tmp",
        overwrite=True,
        max_guide_trees=-1,
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if program == "treecollection":
            return self._put_best_TC_trees(tmpdir=tmpdir, overwrite=overwrite, max_guide_trees=max_guide_trees)
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        self.put_trees(
            rec_list=rec_list,
            program=program,
            model=model,
            ncat=ncat,
            optimise=optimise,
            datatype=datatype,
            tmpdir=tmpdir,
            overwrite=overwrite,
        )
        self.update_scores()

    def _put_best_TC_trees(self, tmpdir="/tmp", overwrite=True, max_guide_trees=-1):
        rec_list = self.get_cluster_records_with_memberships()
        for (rec, members) in rec_list:
            print "Calculating treecollection tree for {0}".format(rec.name),
            if rec.name in self.inferred_trees and not overwrite:
                print "Skipping - already calculated (overwrite set to False)"
                continue
            guidetrees = [self.keys_to_records[member].tree for member in members]
            if max_guide_trees > 0:
                guidetrees = guidetrees[:max_guide_trees]
            TCtrees = []
            pref = rec._write_temp_tc(make_guide_tree=False, tmpdir=tmpdir)
            pref = "{0}/{1}".format(tmpdir, pref)
            dv_file = pref + "_dv.txt"
            labels_file = pref + "_labels.txt"
            map_file = pref + "_map.txt"
            if len(guidetrees) > 1:
                print "(using best of {0} guidetrees)".format(len(guidetrees))
            else:
                print "(using single guidetree)"
            for t in guidetrees:
                guidetree_file = "{0}/{1}.nwk".format(tmpdir, t.name)
                n = t.reroot_newick()
                with open(guidetree_file, "w") as writer:
                    writer.write(n)
                TCtrees.append(Tree.new_treecollection_tree(dv_file, map_file, labels_file, guidetree_file, rec.name))
            best = min(TCtrees, key=lambda x: x.score)
            rec.tree = best
            self.inferred_trees[rec.name] = best
        self.update_scores()

    def update_scores(self):
        for partition in self.partitions.values():
            partition.update_score(self.concats)

    @staticmethod
    def _pivot(lst):
        new_lst = zip(*lst)
        return ["".join(x) for x in new_lst]

    def concatenate_list_of_records(self, records=None):
        if not records:
            records = self.get_records()
        concat = copy.deepcopy(records[0])
        for rec in records[1:]:
            concat += rec
        return concat

    def make_randomised_copy(self, tmpdir=None, get_distances=False, parallel_load=False, overwrite=True):

        shuffled_records = self.get_randomised_alignments()
        if not tmpdir:
            tmpdir = self.tmpdir
        randomised_copy = SequenceCollection(
            input_dir=None,
            records=shuffled_records,
            file_format=self.file_format,
            datatype=self.datatype,
            helper=self.helper,
            tmpdir=tmpdir,
            get_distances=get_distances,
            parallel_load=parallel_load,
            overwrite=overwrite,
        )
        return randomised_copy

    def show_memberships(self):

        partitions = self.get_partitions()
        for (compound_key, partition) in partitions:
            print " ".join(str(x) for x in compound_key)
            print partition
            print partition.get_membership()

    def simulate_from_record(
        self, record, output_dir, name, tmpdir, datatype=None, allow_nonsense=False, split_lengths=None, gene_names=None
    ):

        if not datatype:
            datatype = self.datatype
        if datatype == "protein":
            SeqSim.simulate_from_record_WAG(record, output_dir, name, tmpdir, allow_nonsense, split_lengths, gene_names)
        elif datatype == "dna":
            SeqSim.simulate_from_record_GTR(record, output_dir, name, tmpdir, allow_nonsense, split_lengths, gene_names)
        else:
            print "datatype {0} is not recognised".format(datatype)

    def simulate_from_result(self, key, output_dir, name, tmpdir, datatype=None, allow_nonsense=False):

        if not datatype:
            datatype = self.datatype
        p = self.get_partition(key)
        for c in p.concats:
            # bug: records in Partition objects aren't linked to trees
            updated_record = self.concats[c.name][0]

            members = c.name.split("-")
            lengths = [self.keys_to_records[int(x)].seqlength for x in members]
            names = ["sim" + self.keys_to_records[int(x)].name for x in members]
            self.simulate_from_record(
                updated_record,
                output_dir,
                name=name,
                tmpdir=tmpdir,
                allow_nonsense=allow_nonsense,
                split_lengths=lengths,
                gene_names=names,
            )

    #######################
    # Getters
    #######################

    def get_trees(self):
        return [rec.tree for rec in self.get_records()]

    def get_cluster_records(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name)
        )
        return [rec for (rec, _) in sorted(self.concats.values(), key=sort_key)]

    def get_cluster_records_with_memberships(self):
        """
        Returns all concatenated records from cluster analysis, paired
        with their cluster memberships
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name)
        )
        return sorted(self.concats.values(), key=sort_key)

    def get_cluster_trees(self):
        records = self.get_cluster_records()
        trees = [rec.tree for rec in records]
        return trees

    def get_score(self, key):
        return self.get_partition(key).score

    def get_partition(self, key):
        partition_vector = self.clusters_to_partitions[key]
        return self.partitions[partition_vector]

    def get_membership(self, key, flatten=False):
        return self.get_partition(key).get_membership(flatten=flatten)

    def get_partitions(self):
        return [(k, self.partitions[v]) for (k, v) in self.clusters_to_partitions.items()]

    def get_memberships(self, flatten=False):
        return [
            (k, self.partitions[v].get_membership(flatten=flatten)) for (k, v) in self.clusters_to_partitions.items()
        ]

    def get_scores(self):
        return [(k, self.partitions[v].score) for (k, v) in self.clusters_to_partitions.items()]

    def get_randomised_alignments(self):
        lengths = [rec.seqlength for rec in self.get_records()]
        names = self.get_names()
        datatype = self.records[0].datatype
        concat = self.concatenate_list_of_records()
        concat.shuffle()
        newrecs = concat.split_by_lengths(lengths, names)
        return newrecs

    def get_records(self):
        """
        Returns list of stored sequence records
        """

        return [self.keys_to_records[i] for i in range(self.length)]

    def get_names(self):
        """
        Returns a list of the names of the stored records
        """

        return [rec.name for rec in self.get_records()]

    def get_seqlengths(self):
        """
        Returns a list of the sequence lengths of the stored records
        """

        return [rec.seqlength for rec in self.get_records()]

    def get_distance_matrices(self):
        return self.distance_matrices

    def get_dv_matrices(self):
        dvs = {}
        for rec in self.get_records():
            dvs[rec.name] = rec.dv
        return dvs

    #########################
    # Plotters
    #########################

    def plot_dendrogram(self, metric, link, nclasses, show=True):

        plot_object = self.Clustering.plot_dendrogram((metric, link, nclasses))
        if show:
            plot_object.show()
        return plot_object

    def plot_heatmap(self, distance_matrix, partition, outfile=None):

        sort_partition = partition.get_membership(flatten=True)
        fig = distance_matrix.plot_heatmap(sort_partition=sort_partition)
        if outfile:
            fig.savefig("{0}.pdf".format(outfile))
        return fig

    def plot_embedding(
        self,
        partition_vector,
        distance_matrix,
        embedding="MDS",
        prune=True,
        dimensions=3,
        centre_of_mass=False,
        outfile=None,
        standardize=False,
        normalise=False,
        annotate=False,
    ):
        """
        Plots 2D and 3D embeddings of the trees (MDS or spectral), and
        optionally saves them as pdf files if an outfile is given.
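
        Example (illustrative; assumes trees and an "euc" distance matrix
        have already been computed):

            pvec = collection.put_partition("euc", "spectral", 3)
            dm = collection.get_distance_matrices()["euc"]
            (fig2d, fig3d) = collection.plot_embedding(pvec, dm,
                                                       embedding="MDS",
                                                       outfile="embedding")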
        """

        dm = distance_matrix.matrix
        partition_vector = np.array(partition_vector)
        labels = self.get_names()
        if embedding == "MDS":
            dbc = self.Clustering.get_double_centre(dm)
            (vals, vecs, var_exp) = self.Clustering.get_eigen(dbc, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3, normalise=normalise)
        elif embedding == "spectral":
            laplacian = self.Clustering.spectral(dm, prune=prune)

            (vals, vecs, var_exp) = self.Clustering.get_eigen(laplacian, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3, normalise=normalise)
        else:
            print "embedding should be one of 'MDS' or 'spectral'"
            print "value given was:", embedding
            return
        min_Z = min([z for (x, y, z) in coords])
        P = []  # get the indices of the partition vector for each group

        # and store in this list

        max_groups = max(partition_vector)
        for i in range(1, max_groups + 1):
            partition = np.where(partition_vector == i)
            P.append(partition)

        colors = "bgrcmyk"
        coldict = {"b": "blue", "g": "green", "r": "red", "c": "cyan", "m": "magenta", "y": "yellow", "k": "black"}
        fig2d = plt.figure()
        fig3d = plt.figure()
        ax2d = fig2d.add_subplot(111)
        ax3d = fig3d.add_subplot(111, projection="3d")

        for (pos, partition) in enumerate(P):
            for i in partition[0]:
                ax2d.scatter(color=colors[pos % len(colors)], *(coords[i])[:2])
                ax3d.scatter(color=colors[pos % len(colors)], *coords[i])
                ax3d.plot(
                    [coords[i][0], coords[i][0]],
                    [coords[i][1], coords[i][1]],
                    [min_Z, coords[i][2]],
                    color="grey",
                    linewidth=0.2,
                )

                if annotate:
                    ax2d.annotate(
                        labels[i],
                        xy=(coords[i][0], coords[i][1]),
                        xytext=(-20, 20),
                        textcoords="offset points",
                        fontsize="x-small",
                        ha="right",
                        va="bottom",
                        bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.5),
                        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0"),
                    )

            if centre_of_mass:
                com = np.mean(coords[partition], axis=0)
                ax2d.scatter(color="k", marker="x", s=2, *com[:2])
                ax3d.scatter(color="k", marker="x", s=2, *com)
        if embedding == "spectral" and normalise:
            (u, v) = np.mgrid[0 : 2 * np.pi : 20j, 0 : np.pi : 10j]
            x = np.cos(u) * np.sin(v)
            y = np.sin(u) * np.sin(v)
            z = np.cos(v)
            ax3d.plot_wireframe(x, y, z, color="grey", linewidth=0.2)

        ax2d.set_xlabel("PCo1")
        ax2d.set_ylabel("PCo2")
        ax2d.set_title("Trees embedded in dimension-reduced space")
        ax3d.set_xlabel("PCo1")
        ax3d.set_ylabel("PCo2")
        ax3d.set_zlabel("PCo3")
        ax3d.set_title("Trees embedded in dimension-reduced space")
        if outfile:
            fig2d.savefig("{0}-2d.pdf".format(outfile))
            fig3d.savefig("{0}-3d.pdf".format(outfile))
        return (fig2d, fig3d)

    #########################
    # Parallelisers
    #########################

    def _unpack_dv(self, packed_args):
        return packed_args[0].get_dv_matrix(*packed_args[1:])

    def _dv_parallel_call(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        nprocesses = min(self.length, multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, self.length)
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in self.get_records():
            new_dir = tmpdir + "/" + rec.name
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            args.append((rec, tmpdir + "/" + rec.name, helper, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_dv, args, callback=results.append)
        r.wait()
        for (w, x, y, z) in args:
            if os.path.isdir(x):
                os.rmdir(x)
        results = results[0]
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results))

    def put_dv_matrices_parallel(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        dv_matrices_dict = self._dv_parallel_call(tmpdir, helper, overwrite=overwrite)
        for rec in self.get_records():
            rec.dv = [dv_matrices_dict[rec.name]]

    def _unpack_bionj(self, packed_args):
        return packed_args[0].get_bionj_tree(*packed_args[1:])

    def _bionj_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=1, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_bionj, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_phyml(self, packed_args):
        return packed_args[0].get_phyml_tree(*packed_args[1:])

    def _phyml_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=4, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_phyml, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_raxml(self, packed_args):
        return packed_args[0].get_raxml_tree(*packed_args[1:])

    def _raxml_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_raxml, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def _unpack_TC(self, packed_args):
        return packed_args[0].get_TC_tree(*packed_args[1:])

    def _TC_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_TC, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def put_trees_parallel(
        self, rec_list=None, program="treecollection", model=None, datatype=None, ncat=4, tmpdir="/tmp", overwrite=True
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        if program == "treecollection":
            trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "raxml":
            trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "phyml":
            trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite
            )
        elif program == "bionj":
            trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite
            )
        for rec in rec_list:
            rec.tree = trees_dict[rec.name]
            self.inferred_trees[rec.name] = trees_dict[rec.name]

    def put_cluster_trees_parallel(
        self, program="treecollection", model=None, datatype=None, ncat=4, tmpdir="/tmp", overwrite=True
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        if program == "treecollection":
            cluster_trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "raxml":
            cluster_trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "phyml":
            cluster_trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir, overwrite=overwrite
            )
        elif program == "bionj":
            cluster_trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir, overwrite=overwrite
            )
        for rec in rec_list:
            rec.tree = cluster_trees_dict[rec.name]
        self.update_scores()
Example #2
0
from clustering import Clustering

cl = Clustering('magia_cluster_data')
#cl = Clustering('data_sample')
#cl.generate_clusters(eps=0.5, min_samples=5)

#cl.recommend_n(5931298, 4)

cl.spectral(8)
class SequenceCollection(object):
    """
    Orchestrating class that should:
    a) work as a central repository for the information generated by the
       subordinate classes, and
    b) be the only class directly interacted with by the user

    TO DO:
    implement consistent naming of methods (where appropriate)
    Prefixes:
    get_[something]  - returns the object implied by something
    put_[something]  - puts something in the class data structure
    show_[something] - prints something to screen
    plot_[something] - displays a plot of something
    _[something]     - private method
    """
    def __init__(
        self,
        input_dir=None,
        records=None,
        file_format='fasta',
        datatype='protein',
        helper='./class_files/DV_wrapper.drw',
        tmpdir='/tmp',
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        # Unset Variables

        # Store some mappings for data retrieval

        self.records_to_keys = {}
        self.keys_to_records = {}
        self.clusters_to_partitions = {}
        self.partitions = {}
        self.distance_matrices = {}
        self.concats = {}
        self.inferred_trees = {}
        self.Clustering = Clustering()

        # Store some data

        self.files = None
        self.file_format = file_format
        self.datatype = datatype
        self.records = []
        self.length = 0
        self.helper = helper

        # Set Variables

        self.tmpdir = tmpdir

        # Lambda for sorting by name and number

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item))

        # Can give an input directory as optional argument
        # If given:
        #    read the alignment files
        #    optionally calculate pairwise distances
        #    store the sequence data

        if input_dir:

            files = self.get_files(input_dir, file_format)

            # file checks

            if files == 0:
                print '!!!'
                print 'There was a problem reading files from {0}'.format(
                    input_dir)
                print '!!!'
                sys.exit()

            if get_distances and not os.path.isfile(helper):
                print '!!!'
                print 'There was a problem finding the darwin helper at {0}'.format(
                    helper)
                print '!!!'
                sys.exit()

            # done

            files.sort(key=sort_key)
            self.put_records(files=files,
                             record_list=None,
                             file_format=file_format,
                             datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)
        elif records:

            # Can optionally give record objects directly if no input dir specified

            self.put_records(files=None,
                             record_list=records,
                             file_format=file_format,
                             datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances

        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper,
                                              tmpdir=tmpdir,
                                              overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper,
                                     tmpdir=tmpdir,
                                     overwrite=overwrite)

    def __str__(self):
        s = 'SequenceCollection object:\n'
        s += 'Contains {0} alignments\n'.format(self.length)
        return s

    def __len__(self):
        return self.length

    def get_files(self, input_dir, file_format='fasta'):
        """
        Get list of alignment files from an input directory
        *.fa, *.fas and *.phy files only
        Stores in self.files
        """

        if file_format == 'fasta':
            files = glob.glob('{0}/*.fa'.format(input_dir))
            if len(files) == 0:
                files = glob.glob('{0}/*.fas'.format(input_dir))
        elif file_format == 'phylip':
            files = glob.glob('{0}/*.phy'.format(input_dir))
        else:
            print 'Unrecognised file format %s' % file_format
            files = None
        if not files:
            print 'No sequence files found in {0}'.format(input_dir)
            return 0
        return sorted(files)

    def dump_records(
        self,
        output_dir,
        records=None,
        file_format='phylip',
        use_hashname=True,
    ):
        """
        Dumps all sequence alignment records to an output directory
        Files are dumped in sequential phylip format; by default the
        names are hashed
        """

        directorycheck_and_make(output_dir)

        hash_translation = {}

        if not records:
            records = self.get_records()

        for rec in records:
            filename = rec._write_temp_phylip(output_dir,
                                              use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                print type(rec.name), rec.name, type(filename), filename
        cPickle.dump(hash_translation,
                     open('{0}/hash_translation.pkl'.format(output_dir), 'w'))

    def hash(self, string):
        H = hashlib.sha1(string)
        return H.hexdigest()

    def gzip(self, filename):

        if not filename.endswith('.gz'):
            filename += '.gz'

        cPickle.dump(self, file=gz.open(filename, 'wb'), protocol=-1)

    @classmethod
    def gunzip(cls, filename):

        return cPickle.load(gz.open(filename, 'rb'))

    def put_records(
        self,
        files=None,
        record_list=None,
        file_format='fasta',
        datatype='protein',
    ):
        """
        Reads sequence files from the list generated by
        get_files and stores in self.records
        """

        get_name = lambda i: i[i.rindex('/') + 1:i.rindex('.')]

        if files and not record_list:
            record_list = [
                TCSeqRec(f,
                         file_format=file_format,
                         name=get_name(f),
                         datatype=datatype) for f in files
            ]
        elif not files and not record_list:

            print 'Can\'t load records - no records or alignment files given'
            return

        records_to_keys = dict([(record.name, number)
                                for (number, record) in enumerate(record_list)
                                ])
        keys_to_records = dict(enumerate(record_list))
        self.records = record_list
        self.length = len(record_list)
        self.records_to_keys = records_to_keys
        self.keys_to_records = keys_to_records

    def load_phyml_results(
        self,
        input_dir,
        records=None,
        use_hashname=False,
        program='phyml',
    ):

        if not records:
            records = self.get_records()
        failures = []
        for rec in records:
            if use_hashname:
                name = rec.hashname()
            else:
                name = rec.name
            tree_file = '{0}/{1}.phy_phyml_tree.txt'.format(input_dir, name)
            stats_file = \
                '{0}/{1}.phy_phyml_stats.txt'.format(input_dir, name)

            try:
                rec.tree.load_phyml_results(tree_file,
                                            stats_file,
                                            name=rec.name,
                                            program=program)
            except FileError:
                failures.append(rec.name)

        if failures:
            print 'Couldn\'t load results for the following records:'
            for f in failures:
                print '   ', f

    def sanitise_records(self):
        """
        Sorts records alphabetically, trims whitespace from beginning
        of record headers, removes '/' characters from headers,
        replaces spaces with underscores, puts sequences into upper case
        """

        for rec in self.get_records():
            rec.sanitise()

    def put_dv_matrices(
        self,
        tmpdir='/tmp',
        helper='./class_files/DV_wrapper.drw',
        overwrite=True,
    ):

        for rec in self.get_records():
            rec.dv = [
                rec.get_dv_matrix(tmpdir=tmpdir,
                                  helper=helper,
                                  overwrite=overwrite)
            ]

    def put_trees(
        self,
        rec_list=None,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        optimise='n',
        tmpdir=None,
        overwrite=True,
        verbose=False,
    ):

        if tmpdir is None:
            tmpdir = self.tmpdir
        if not program in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if not rec_list:
            rec_list = self.records
        for rec in rec_list:
            if overwrite is False:
                if rec.name in self.inferred_trees:
                    continue
            if program == 'treecollection':
                tree = rec.get_TC_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == 'raxml':
                tree = rec.get_raxml_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == 'phyml':
                tree = rec.get_phyml_tree(
                    model=model,
                    datatype=datatype,
                    tmpdir=tmpdir,
                    ncat=ncat,
                    overwrite=overwrite,
                    verbose=verbose,
                )
            elif program == 'bionj':
                tree = rec.get_bionj_tree(
                    model=model,
                    datatype=datatype,
                    tmpdir=tmpdir,
                    ncat=ncat,
                    optimise=optimise,
                    overwrite=overwrite,
                    verbose=verbose,
                )
            self.inferred_trees[rec.name] = tree

    def put_distance_matrices(
        self,
        metrics,
        tmpdir='/tmp',
        normalise=False,
    ):
        """
        Pass this function a list of metrics
        valid kwargs - invert (bool), normalise (bool)
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        trees = [rec.tree for rec in self.get_records()]
        for metric in metrics:
            dm = DistanceMatrix(trees, tmpdir=tmpdir)
            dm.get_distance_matrix(metric, normalise=normalise)
            self.distance_matrices[metric] = dm

    def put_partition(
        self,
        metric,
        cluster_method,
        nclusters,
        prune=True,
        tmpdir=None,
        recalculate=False,
    ):

        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        partition_vector = \
            self.Clustering.run_clustering(self.distance_matrices[metric],
                cluster_method, nclusters, prune=prune,
                recalculate=recalculate)

        self.clusters_to_partitions[(metric, cluster_method,
                                     nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return partition_vector

    def put_partition_vector(self, partition_vector, name):
        """
        Given a partition vector (i.e. a tuple containing the class-
        membership for each gene alignment), inserts the relevant data
        structures into the SequenceCollection object.
        NEXT: run concatenate_records(), put_cluster_trees()
        """

        self.clusters_to_partitions[name] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)

    def put_partitions(
        self,
        metrics,
        cluster_methods,
        nclusters,
        prune=True,
        tmpdir=None,
        recalculate=False,
    ):
        """
        metrics, linkages and nclasses are given as lists, or coerced into
        lists
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        if not isinstance(cluster_methods, list):
            cluster_methods = [cluster_methods]
        if not isinstance(nclusters, list):
            nclusters = [nclusters]
        if tmpdir is None:
            tmpdir = self.tmpdir
        else:
            nclusters = sorted(nclusters, reverse=True)

        # names = [rec.name for rec in self.get_records()]

        for metric in metrics:
            print 'Clustering {0} data'.format(metric)
            self.Clustering.clear_cache()
            for cluster_method in cluster_methods:
                print ' ', cluster_method
                for n in nclusters:
                    key = (metric, cluster_method, n)
                    if key in self.clusters_to_partitions:
                        continue
                    else:
                        self.put_partition(
                            metric,
                            cluster_method,
                            n,
                            prune=prune,
                            tmpdir=tmpdir,
                            recalculate=recalculate,
                        )

    def concatenate_records(self):
        for p in self.partitions.values():
            p.concatenate_records(self.keys_to_records)
            for concat in p.concats:
                if not concat[0].name in self.concats:
                    self.concats[concat[0].name] = concat

    def autotune(
        self,
        metric,
        prune=True,
        KMeans=True,
        recalculate=True,
        tmpdir=None,
        max_groups=None,
        min_groups=2,
        check_single=True,
    ):
        """
        Uses Perona and Zelnick-Manor's spectral rotation method to determine
        the number of clusters present in the data
        """

        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        dm = self.get_distance_matrices()[metric]

        if check_single and min_groups > 1:
            print 'Checking for single cluster...'
            (partition_vector, nclusters, quality_scores) = \
                self.Clustering.run_spectral_rotate(
                dm,
                prune=prune,
                KMeans=KMeans,
                recalculate=recalculate,
                max_groups=6,
                min_groups=1,
                verbose=False,
                )
            if nclusters == 1:
                print 'Single cluster found.'
                print 'Quality Scores: {0}'.format(quality_scores)

                self.clusters_to_partitions[(metric, 'rotate',
                                             nclusters)] = partition_vector
                self.partitions[partition_vector] = \
                    Partition(partition_vector)
                return (partition_vector, quality_scores)
            else:
                print '>1 clusters found.'
                print 'Quality Scores: {0}'.format(quality_scores)
                recalculate = False

        (partition_vector, nclusters, quality_scores) = \
            self.Clustering.run_spectral_rotate(
            dm,
            prune=prune,
            KMeans=KMeans,
            recalculate=recalculate,
            max_groups=max_groups,
            min_groups=min_groups,
            )

        self.clusters_to_partitions[(metric, 'rotate', nclusters)] = \
            partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return (partition_vector, quality_scores)

    def put_cluster_trees(
        self,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        optimise='n',
        tmpdir='/tmp',
        overwrite=True,
        max_guide_trees=True,
    ):

        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if program == 'treecollection':
            return self._put_best_TC_trees(tmpdir=tmpdir,
                                           overwrite=overwrite,
                                           max_guide_trees=max_guide_trees)
        rec_list = self.get_cluster_records()
        print 'Inferring {0} cluster trees'.format(len(rec_list))
        self.put_trees(
            rec_list=rec_list,
            program=program,
            model=model,
            ncat=ncat,
            optimise=optimise,
            datatype=datatype,
            tmpdir=tmpdir,
            overwrite=overwrite,
        )
        self.update_scores()

    def _put_best_TC_trees(
        self,
        tmpdir='/tmp',
        overwrite=True,
        max_guide_trees=-1,
    ):
        rec_list = self.get_cluster_records_with_memberships()
        for (rec, members) in rec_list:
            print 'Calculating treecollection tree for {0}'.format(rec.name),
            if rec.name in self.inferred_trees and overwrite == False:
                print 'Skipping - already calculated (overwrite set to False)'
                continue
            guidetrees = [
                self.keys_to_records[member].tree for member in members
            ]
            if max_guide_trees > 0:
                guidetrees = guidetrees[:max_guide_trees]
            TCtrees = []
            pref = rec._write_temp_tc(make_guide_tree=False, tmpdir=tmpdir)
            pref = '{0}/{1}'.format(tmpdir, pref)
            dv_file = pref + '_dv.txt'
            labels_file = pref + '_labels.txt'
            map_file = pref + '_map.txt'
            if len(guidetrees) > 1:
                print '(using best of {0} guidetrees)'.format(len(guidetrees))
            else:
                print '(using single guidetree)'
            for t in guidetrees:
                guidetree_file = '{0}/{1}.nwk'.format(tmpdir, t.name)
                n = t.reroot_newick()
                with open(guidetree_file, 'w') as writer:
                    writer.write(n)
                TCtrees.append(
                    Tree.new_treecollection_tree(dv_file, map_file,
                                                 labels_file, guidetree_file,
                                                 rec.name))
            best = min(TCtrees, key=lambda x: x.score)
            rec.tree = best
            self.inferred_trees[rec.name] = best
        self.update_scores()

    def update_scores(self):
        for partition in self.partitions.values():
            partition.update_score(self.concats)

    def _pivot(lst):
        new_lst = zip(*lst)
        return [''.join(x) for x in new_lst]

    def concatenate_list_of_records(self, records=None):
        if not records:
            records = self.get_records()
        concat = copy.deepcopy(records[0])
        for rec in records[1:]:
            concat += rec
        return concat

    def make_randomised_copy(
        self,
        tmpdir=None,
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        shuffled_records = self.get_randomised_alignments()
        if not tmpdir:
            tmpdir = self.tmpdir
        randomised_copy = SequenceCollection(
            input_dir=None,
            records=shuffled_records,
            file_format=self.file_format,
            datatype=self.datatype,
            helper=self.helper,
            tmpdir=tmpdir,
            get_distances=get_distances,
            parallel_load=parallel_load,
            overwrite=overwrite,
        )
        return randomised_copy

    def show_memberships(self):

        partitions = self.get_partitions()
        for compound_key in partitions:
            print ' '.join(str(x) for x in compound_key)
            partition = partitions[compound_key]
            print partition
            print self.clustering.get_memberships(partition)

    def simulate_from_record(
        self,
        record,
        output_dir,
        name,
        tmpdir,
        datatype=None,
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
    ):

        if not datatype:
            datatype = self.datatype
        if datatype == 'protein':
            SeqSim.simulate_from_record_WAG(
                record,
                output_dir,
                name,
                tmpdir,
                allow_nonsense,
                split_lengths,
                gene_names,
            )
        elif datatype == 'dna':
            SeqSim.simulate_from_record_GTR(
                record,
                output_dir,
                name,
                tmpdir,
                allow_nonsense,
                split_lengths,
                gene_names,
            )
        else:
            print 'datatype {0} is not recognised'.format(datatype)

    def simulate_from_result(
        self,
        key,
        output_dir,
        name,
        tmpdir,
        datatype=None,
        allow_nonsense=False,
    ):

        if not datatype:
            datatype = self.datatype
        p = self.get_partition(key)
        for c in p.concats:
            # bug: records in Partition objects aren't linked to trees
            updated_record = self.concats[c.name][0]

            members = c.name.split('-')
            lengths = [self.keys_to_records[int(x)].seqlength for x in members]
            names = [
                'sim' + self.keys_to_records[int(x)].name for x in members
            ]
            self.simulate_from_record(
                updated_record,
                output_dir,
                name=name,
                tmpdir=tmpdir,
                allow_nonsense=allow_nonsense,
                split_lengths=lengths,
                gene_names=names,
            )

#######################
# Getters
#######################

    def get_trees(self):
        return [rec.tree for rec in self.get_records()]

    def get_cluster_records(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item[0].name))
        return [
            rec for (rec, _) in sorted(self.concats.values(), key=sort_key)
        ]

    def get_cluster_records_with_memberships(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item[0].name))
        return sorted(self.concats.values(), key=sort_key)

    def get_cluster_trees(self):
        records = self.get_cluster_records()
        trees = [rec.tree for rec in records]
        return trees

    def get_score(self, key):
        return self.get_partition(key).score

    def get_partition(self, key):
        partition_vector = self.clusters_to_partitions[key]
        return self.partitions[partition_vector]

    def get_membership(self, key, flatten=False):
        return self.get_partition(key).get_membership(flatten=flatten)

    def get_partitions(self):
        return [(k, self.partitions[v])
                for (k, v) in self.clusters_to_partitions.items()]

    def get_memberships(self, flatten=False):
        return [(k, self.partitions[v].get_membership(flatten=flatten))
                for (k, v) in self.clusters_to_partitions.items()]

    def get_scores(self):
        return [(k, self.partitions[v].score)
                for (k, v) in self.clusters_to_partitions.items()]

    def get_randomised_alignments(self):
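        """
        Concatenates all stored alignments, shuffles the concatenated record,
        then splits it back into alignments with the original lengths and
        names
        """
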
        lengths = [rec.seqlength for rec in self.get_records()]
        names = self.get_names()
        datatype = self.records[0].datatype
        concat = self.concatenate_list_of_records()
        concat.shuffle()
        newrecs = concat.split_by_lengths(lengths, names)
        return newrecs

    def get_records(self):
        """
        Returns list of stored sequence records
        """

        return [self.keys_to_records[i] for i in range(self.length)]

    def get_names(self):
        """
        Returns a list of the names of the stored records
        """

        return [rec.name for rec in self.get_records()]

    def get_seqlengths(self):
        """
        Returns a list of the sequence lengths of the stored records
        """

        return [rec.seqlength for rec in self.get_records()]

    def get_distance_matrices(self):
        return self.distance_matrices

    def get_dv_matrices(self):
        dvs = {}
        for rec in self.get_records():
            dvs[rec.name] = rec.dv
        return dvs

#########################
# Plotters
#########################

    def plot_dendrogram(
        self,
        metric,
        link,
        nclasses,
        show=True,
    ):
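        """
        Plots the hierarchical-clustering dendrogram for the given
        (metric, link, nclasses) combination, optionally displaying it on
        screen
        """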

        plot_object = self.Clustering.plot_dendrogram((metric, link, nclasses))
        if show:
            plot_object.show()
        return plot_object

    def plot_heatmap(
        self,
        distance_matrix,
        partition,
        outfile=None,
    ):

        sort_partition = partition.get_membership(flatten=True)
        fig = \
            distance_matrix.plot_heatmap(sort_partition=sort_partition)
        if outfile:
            fig.savefig('{0}.pdf'.format(outfile))
        return fig

    def plot_embedding(
        self,
        partition_vector,
        distance_matrix,
        embedding='MDS',
        prune=True,
        dimensions=3,
        centre_of_mass=False,
        outfile=None,
        standardize=False,
        normalise=False,
        annotate=False,
    ):
        """
        Plots an embedding of the trees in a Principal Coordinate space,
        and saves as pdf.
        """

        dm = distance_matrix.matrix
        partition_vector = np.array(partition_vector)
        labels = self.get_names()
        if embedding == 'MDS':
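            # classical MDS: double-centre the distance matrix and use its
            # leading eigenvectors as coordinates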
            dbc = self.Clustering.get_double_centre(dm)
            (vals, vecs,
             var_exp) = self.Clustering.get_eigen(dbc, standardize=standardize)
            (coords,
             _) = self.Clustering.get_coords_by_dimension(vals,
                                                          vecs,
                                                          var_exp,
                                                          3,
                                                          normalise=normalise)
        elif embedding == 'spectral':
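            # spectral embedding: build a graph Laplacian from the distance
            # matrix and eigendecompose it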
            laplacian = self.Clustering.spectral(dm, prune=prune)

            (vals, vecs, var_exp) = \
                self.Clustering.get_eigen(laplacian,
                    standardize=standardize)
            (coords,
             _) = self.Clustering.get_coords_by_dimension(vals,
                                                          vecs,
                                                          var_exp,
                                                          3,
                                                          normalise=normalise)
        else:
            print 'embedding should be one of \'MDS\' or \'spectral\''
            print 'value given was:', embedding
            return
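        # the minimum z-value is the 'floor' that the vertical guide lines in
        # the 3D plot drop down to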
        min_Z = min([z for (x, y, z) in coords])
        # get the indices of the partition vector for each group and store
        # them in this list
        P = []

        max_groups = max(partition_vector)
        for i in range(1, max_groups + 1):
            partition = np.where(partition_vector == i)
            P.append(partition)

        colors = 'bgrcmyk'
        coldict = {
            'b': 'blue',
            'g': 'green',
            'r': 'red',
            'c': 'cyan',
            'm': 'magenta',
            'y': 'yellow',
            'k': 'black',
        }
        fig2d = plt.figure()
        fig3d = plt.figure()
        ax2d = fig2d.add_subplot(111)
        ax3d = fig3d.add_subplot(111, projection='3d')

        for (pos, partition) in enumerate(P):
            for i in partition[0]:
                ax2d.scatter(color=colors[pos % len(colors)], *(coords[i])[:2])
                ax3d.scatter(color=colors[pos % len(colors)], *coords[i])
                ax3d.plot([coords[i][0], coords[i][0]],
                          [coords[i][1], coords[i][1]], [min_Z, coords[i][2]],
                          color='grey',
                          linewidth=0.2)

                if annotate:
                    ax2d.annotate(
                        labels[i],
                        xy=(coords[i][0], coords[i][1]),
                        xytext=(-20, 20),
                        textcoords='offset points',
                        fontsize='x-small',
                        ha='right',
                        va='bottom',
                        bbox=dict(boxstyle='round,pad=0.5',
                                  fc='yellow',
                                  alpha=0.5),
                        arrowprops=dict(arrowstyle='->',
                                        connectionstyle='arc3,rad=0'),
                    )

            if centre_of_mass:
                com = np.mean(coords[partition], axis=0)
                ax2d.scatter(color='k', marker='x', s=2, *com[:2])
                ax3d.scatter(color='k', marker='x', s=2, *com)
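        # normalised spectral coordinates lie on the unit sphere, so draw a
        # reference wireframe sphere behind the points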
        if embedding == 'spectral' and normalise:
            (u, v) = np.mgrid[0:2 * np.pi:20j, 0:np.pi:10j]
            x = np.cos(u) * np.sin(v)
            y = np.sin(u) * np.sin(v)
            z = np.cos(v)
            ax3d.plot_wireframe(x, y, z, color='grey', linewidth=0.2)

        ax2d.set_xlabel('PCo1')
        ax2d.set_ylabel('PCo2')
        ax2d.set_title('Trees embedded in dimension-reduced space')
        ax3d.set_xlabel('PCo1')
        ax3d.set_ylabel('PCo2')
        ax3d.set_zlabel('PCo3')
        ax3d.set_title('Trees embedded in dimension-reduced space')
        if outfile:
            fig2d.savefig('{0}-2d.pdf'.format(outfile))
            fig3d.savefig('{0}-3d.pdf'.format(outfile))
        return (fig2d, fig3d)

#########################
# Parallelisers
#########################

    def _unpack_dv(self, packed_args):
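        # multiprocessing.Pool workers take a single argument, so each
        # _unpack_* helper re-expands a packed (record, *args) tuple into the
        # corresponding record method call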
        return packed_args[0].get_dv_matrix(*packed_args[1:])

    def _dv_parallel_call(
        self,
        tmpdir='/tmp',
        helper='./class_files/DV_wrapper.drw',
        overwrite=True,
    ):
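        """
        Calculates each record's DV matrix (via the darwin helper script) in
        a multiprocessing pool, one job per record, and returns a dict
        mapping record names to results
        """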

        nprocesses = min(self.length, multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, self.length)
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in self.get_records():
            new_dir = tmpdir + '/' + rec.name
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            args.append((rec, new_dir, helper, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_dv, args, callback=results.append)
        r.wait()
        for (_, rec_dir, _, _) in args:
            if os.path.isdir(rec_dir):
                os.rmdir(rec_dir)
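        # map_async's callback receives the complete list of results in one
        # call, so results is a one-element list; unwrap it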
        results = results[0]
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results))

    def put_dv_matrices_parallel(
        self,
        tmpdir='/tmp',
        helper='./class_files/DV_wrapper.drw',
        overwrite=True,
    ):

        dv_matrices_dict = self._dv_parallel_call(tmpdir,
                                                  helper,
                                                  overwrite=overwrite)
        for rec in self.get_records():
            rec.dv = [dv_matrices_dict[rec.name]]

    def _unpack_bionj(self, packed_args):
        return packed_args[0].get_bionj_tree(*packed_args[1:])

    def _bionj_parallel_call(
        self,
        model=None,
        datatype=None,
        rec_list=None,
        ncat=1,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((
                rec,
                model,
                datatype,
                ncat,
                tmpdir,
                overwrite,
            ))
            names.append(rec.name)
        r = pool.map_async(self._unpack_bionj, args, callback=results.append)
        r.wait()
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results[0]))

    def _unpack_phyml(self, packed_args):
        return packed_args[0].get_phyml_tree(*packed_args[1:])

    def _phyml_parallel_call(
        self,
        model=None,
        datatype=None,
        rec_list=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((
                rec,
                model,
                datatype,
                ncat,
                tmpdir,
                overwrite,
            ))
            names.append(rec.name)
        r = pool.map_async(self._unpack_phyml, args, callback=results.append)
        r.wait()
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results[0]))

    def _unpack_raxml(self, packed_args):
        return packed_args[0].get_raxml_tree(*packed_args[1:])

    def _raxml_parallel_call(
        self,
        rec_list=None,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_raxml, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def _unpack_TC(self, packed_args):
        return packed_args[0].get_TC_tree(*packed_args[1:])

    def _TC_parallel_call(
        self,
        rec_list=None,
        tmpdir='/tmp',
        overwrite=True,
    ):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_TC, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def put_trees_parallel(
        self,
        rec_list=None,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):
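        """
        Infers a tree for each record in parallel, dispatching to one of the
        'treecollection', 'raxml', 'phyml' or 'bionj' back-ends, and stores
        the result on the record and in self.inferred_trees
        """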

        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if not rec_list:
            rec_list = self.records
        if program == 'treecollection':
            trees_dict = self._TC_parallel_call(rec_list=rec_list,
                                                tmpdir=tmpdir,
                                                overwrite=overwrite)
        elif program == 'raxml':
            trees_dict = self._raxml_parallel_call(rec_list=rec_list,
                                                   tmpdir=tmpdir,
                                                   overwrite=overwrite)
        elif program == 'phyml':
            trees_dict = self._phyml_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                tmpdir=tmpdir,
                ncat=ncat,
                overwrite=overwrite,
            )
        elif program == 'bionj':
            trees_dict = self._bionj_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                tmpdir=tmpdir,
                ncat=ncat,
                overwrite=overwrite,
            )
        for rec in rec_list:
            rec.tree = trees_dict[rec.name]
            self.inferred_trees[rec.name] = trees_dict[rec.name]

    def put_cluster_trees_parallel(
        self,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):
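        """
        Infers trees for the concatenated cluster records in parallel,
        dispatching to one of the 'treecollection', 'raxml', 'phyml' or
        'bionj' back-ends
        """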

        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        rec_list = self.get_cluster_records()
        print 'Inferring {0} cluster trees'.format(len(rec_list))
        if program == 'treecollection':
            cluster_trees_dict = \
                self._TC_parallel_call(rec_list=rec_list,
                    tmpdir=tmpdir, overwrite=overwrite)
        elif program == 'raxml':
            cluster_trees_dict = \
                self._raxml_parallel_call(rec_list=rec_list,
                    tmpdir=tmpdir, overwrite=overwrite)
        elif program == 'phyml':
            cluster_trees_dict = self._phyml_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                ncat=ncat,
                tmpdir=tmpdir,
                overwrite=overwrite,
            )
        elif program == 'bionj':
            cluster_trees_dict = self._bionj_parallel_call(
                rec_list=rec_list,
                model=model,
                datatype=datatype,
                ncat=ncat,
                tmpdir=tmpdir,
                overwrite=overwrite,
            )
        for rec in rec_list:
            rec.tree = cluster_trees_dict[rec.name]
        self.update_scores()