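# -----------------------------------------------------------------------------
# Imports assumed by the class below (a sketch, reconstructed from usage).
# The standard-library and third-party imports are certain from the code; the
# local module paths are guesses -- only the names Clustering, Partition,
# DistanceMatrix, Tree, TCSeqRec, SeqSim, FileError and
# directorycheck_and_make can be inferred, so adjust these to the real
# package layout.
# -----------------------------------------------------------------------------
import copy
import glob
import hashlib
import multiprocessing
import os
import re
import sys

import cPickle
import gzip as gz

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection

from clustering import Clustering, Partition  # assumed local modules
from distance_matrix import DistanceMatrix    # assumed
from tree import Tree                         # assumed
from sequence_record import TCSeqRec          # assumed
import sequence_simulator as SeqSim           # assumed
from errors import FileError                  # assumed
from utils import directorycheck_and_make     # assumed

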
class SequenceCollection(object): """ Orchestrating class that should: a) work as a central repository for the information generated by the subordinate classes, and b) be the only class directly interacted with by the user TO DO: implement consistent naming of methods (where appropriate) Prefixes: get_[something] - returns the object implied by something put_[something] - puts something in the class data structure show_[something] - prints something to screen plot_[something] - displays a plot of something _[something] - private method """ def __init__( self, input_dir=None, records=None, file_format='fasta', datatype='protein', helper='./class_files/DV_wrapper.drw', tmpdir='/tmp', get_distances=False, parallel_load=False, overwrite=True, ): # Unset Variables # Store some mappings for data retrieval self.records_to_keys = {} self.keys_to_records = {} self.clusters_to_partitions = {} self.partitions = {} self.distance_matrices = {} self.concats = {} self.inferred_trees = {} self.Clustering = Clustering() # Store some data self.files = None self.file_format = file_format self.datatype = datatype self.records = [] self.length = 0 self.helper = helper # Set Variables self.tmpdir = tmpdir # Lambda for sorting by name and number sort_key = lambda item: tuple( (int(num) if num else alpha) for (num, alpha) in re.findall(r'(\d+)|(\D+)', item)) # Can give an input directory as optional argument # If given: # read the alignment files # optionally calculate pairwise distances # store the sequence data if input_dir: files = self.get_files(input_dir, file_format) # file checks if files == 0: print '!!!' print 'There was a problem reading files from {0}'.format( input_dir) print '!!!' sys.exit() if get_distances and not os.path.isfile(helper): print '!!!' print 'There was a problem finding the darwin helper at {0}'.format( helper) print '!!!' 
                sys.exit()  # done
            files.sort(key=sort_key)
            self.put_records(files=files, record_list=None,
                             file_format=file_format,
                             datatype=datatype)  # takes care of self.length
            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)

        elif records:
            # Can optionally give record objects directly if no input dir
            # is specified
            self.put_records(files=None, record_list=records,
                             file_format=file_format,
                             datatype=datatype)  # takes care of self.length
            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances
        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper, tmpdir=tmpdir,
                                              overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper, tmpdir=tmpdir,
                                     overwrite=overwrite)

    def __str__(self):
        s = 'SequenceCollection object:\n'
        s += 'Contains {0} alignments\n'.format(self.length)
        return s

    def __len__(self):
        return self.length

    def get_files(self, input_dir, file_format='fasta'):
        """
        Gets the list of alignment files from an input directory
        (*.fa, *.fas and *.phy files only).
        Returns the sorted list, or 0 if nothing was found.
        """

        if file_format == 'fasta':
            files = glob.glob('{0}/*.fa'.format(input_dir))
            if len(files) == 0:
                files = glob.glob('{0}/*.fas'.format(input_dir))
        elif file_format == 'phylip':
            files = glob.glob('{0}/*.phy'.format(input_dir))
        else:
            print 'Unrecognised file format {0}'.format(file_format)
            files = None
        if not files:
            print 'No sequence files found in {0}'.format(input_dir)
            return 0
        return sorted(files)

    def dump_records(self, output_dir, records=None, file_format='phylip',
                     use_hashname=True):
        """
        Dumps all sequence alignment records to an output directory.
        Files are dumped in sequential phylip format (the file_format
        argument is currently ignored); by default the filenames are hashed
        """

        directorycheck_and_make(output_dir)
        hash_translation = {}
        if not records:
            records = self.get_records()
        for rec in records:
            filename = rec._write_temp_phylip(output_dir,
                                              use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                print type(rec.name), rec.name, type(filename), filename
        cPickle.dump(hash_translation,
                     open('{0}/hash_translation.pkl'.format(output_dir), 'w'))

    def hash(self, string):
        H = hashlib.sha1(string)
        return H.hexdigest()

    def gzip(self, filename):
        if not filename.endswith('.gz'):
            filename += '.gz'
        cPickle.dump(self, file=gz.open(filename, 'wb'), protocol=-1)

    @classmethod
    def gunzip(cls, filename):
        return cPickle.load(gz.open(filename, 'rb'))

    def put_records(self, files=None, record_list=None, file_format='fasta',
                    datatype='protein'):
        """
        Reads sequence files from the list generated by get_files
        and stores the records in self.records
        """

        # basename without extension; os.path handles paths with no '/'
        get_name = lambda i: os.path.splitext(os.path.basename(i))[0]

        if files and not record_list:
            record_list = [TCSeqRec(f, file_format=file_format,
                                    name=get_name(f), datatype=datatype)
                           for f in files]
        elif not files and not record_list:
            print "Can't load records - no records or alignment files given"
            return

        records_to_keys = dict([(record.name, number) for (number, record)
                                in enumerate(record_list)])
        keys_to_records = dict(enumerate(record_list))
        self.records = record_list
        self.length = len(record_list)
        self.records_to_keys = records_to_keys
        self.keys_to_records = keys_to_records

    def load_phyml_results(self, input_dir, records=None, use_hashname=False,
                           program='phyml'):
        if not records:
            records = self.get_records()
        failures = []
        for rec in records:
            if use_hashname:
                name = rec.hashname()
            else:
                name = rec.name
            tree_file = '{0}/{1}.phy_phyml_tree.txt'.format(input_dir, name)
            stats_file = '{0}/{1}.phy_phyml_stats.txt'.format(input_dir, name)
            try:
                rec.tree.load_phyml_results(tree_file, stats_file,
                                            name=rec.name, program=program)
            except FileError:
                failures.append(rec.name)
        if failures:
            print "Couldn't load results for the following records:"
            for f in failures:
                print ' ', f

    def sanitise_records(self):
        """
        Sorts records alphabetically, trims whitespace from the beginning
        of record headers, removes '/' characters from headers, replaces
        spaces with underscores, and puts sequences into upper case
        """

        for rec in self.get_records():
            rec.sanitise()

    def put_dv_matrices(self, tmpdir='/tmp',
                        helper='./class_files/DV_wrapper.drw',
                        overwrite=True):
        for rec in self.get_records():
            rec.dv = [rec.get_dv_matrix(tmpdir=tmpdir, helper=helper,
                                        overwrite=overwrite)]

    def put_trees(
        self,
        rec_list=None,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        optimise='n',
        tmpdir=None,
        overwrite=True,
        verbose=False,
    ):
        if tmpdir is None:
            tmpdir = self.tmpdir
        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if not rec_list:
            rec_list = self.records
        for rec in rec_list:
            if overwrite is False:
                if rec.name in self.inferred_trees:
                    continue
            if program == 'treecollection':
                tree = rec.get_TC_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == 'raxml':
                tree = rec.get_raxml_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == 'phyml':
                tree = rec.get_phyml_tree(model=model, datatype=datatype,
                                          tmpdir=tmpdir, ncat=ncat,
                                          overwrite=overwrite,
                                          verbose=verbose)
            elif program == 'bionj':
                tree = rec.get_bionj_tree(model=model, datatype=datatype,
                                          tmpdir=tmpdir, ncat=ncat,
                                          optimise=optimise,
                                          overwrite=overwrite,
                                          verbose=verbose)
            self.inferred_trees[rec.name] = tree

    def put_distance_matrices(self, metrics, tmpdir='/tmp', normalise=False):
        """
        Pass this function a metric, or a list of metrics; a DistanceMatrix
        is calculated and stored for each one
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        trees = [rec.tree for rec in self.get_records()]
        for metric in metrics:
            dm = DistanceMatrix(trees, tmpdir=tmpdir)
            dm.get_distance_matrix(metric, normalise=normalise)
            self.distance_matrices[metric] = dm

    def put_partition(
        self,
        metric,
        cluster_method,
        nclusters,
        prune=True,
        tmpdir=None,
        recalculate=False,
    ):
        if not tmpdir:
            tmpdir = self.tmpdir
        if metric not in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)

        partition_vector = self.Clustering.run_clustering(
            self.distance_matrices[metric], cluster_method, nclusters,
            prune=prune, recalculate=recalculate)
        self.clusters_to_partitions[(metric, cluster_method, nclusters)] = \
            partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return partition_vector
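    # Example (illustrative, not from the original source): partitions made
    # by put_partition() are retrievable via the compound key
    # (metric, cluster_method, nclusters), e.g.
    #
    #     vec = collection.put_partition('euc', 'spectral', 4)
    #     collection.get_partition(('euc', 'spectral', 4))  # -> Partition
    #
    # 'euc' and 'spectral' are assumed example values; the valid names
    # depend on the DistanceMatrix and Clustering implementations.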
    def put_partition_vector(self, partition_vector, name):
        """
        Given a partition vector (i.e. a tuple containing the class
        membership of each gene alignment), inserts the relevant data
        structures into the SequenceCollection object.
        NEXT: run concatenate_records(), put_cluster_trees()
        """

        self.clusters_to_partitions[name] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)

    def put_partitions(
        self,
        metrics,
        cluster_methods,
        nclusters,
        prune=True,
        tmpdir=None,
        recalculate=False,
    ):
        """
        metrics, cluster_methods and nclusters are given as lists,
        or are coerced into lists
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        if not isinstance(cluster_methods, list):
            cluster_methods = [cluster_methods]
        if not isinstance(nclusters, list):
            nclusters = [nclusters]
        if tmpdir is None:
            tmpdir = self.tmpdir
        # Cluster the largest groupings first. (Previously this sort ran
        # only when tmpdir was supplied, which looks like a misattached
        # else branch.)
        nclusters = sorted(nclusters, reverse=True)
        for metric in metrics:
            print 'Clustering {0} data'.format(metric)
            self.Clustering.clear_cache()
            for cluster_method in cluster_methods:
                print ' ', cluster_method
                for n in nclusters:
                    key = (metric, cluster_method, n)
                    if key in self.clusters_to_partitions:
                        continue
                    self.put_partition(metric, cluster_method, n,
                                       prune=prune, tmpdir=tmpdir,
                                       recalculate=recalculate)

    def concatenate_records(self):
        for p in self.partitions.values():
            p.concatenate_records(self.keys_to_records)
            for concat in p.concats:
                if concat[0].name not in self.concats:
                    self.concats[concat[0].name] = concat

    def autotune(
        self,
        metric,
        prune=True,
        KMeans=True,
        recalculate=True,
        tmpdir=None,
        max_groups=None,
        min_groups=2,
        check_single=True,
    ):
        """
        Uses Perona and Zelnik-Manor's spectral rotation method to
        determine the number of clusters present in the data
        """

        if not tmpdir:
            tmpdir = self.tmpdir
        if metric not in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        dm = self.get_distance_matrices()[metric]

        if check_single and min_groups > 1:
            # quick scan, capped at six groups, to test whether a single
            # cluster best fits the data
            print 'Checking for single cluster...'
            (partition_vector, nclusters, quality_scores) = \
                self.Clustering.run_spectral_rotate(
                    dm, prune=prune, KMeans=KMeans, recalculate=recalculate,
                    max_groups=6, min_groups=1, verbose=False)
            if nclusters == 1:
                print 'Single cluster found.'
                print 'Quality Scores: {0}'.format(quality_scores)
                self.clusters_to_partitions[(metric, 'rotate', nclusters)] = \
                    partition_vector
                self.partitions[partition_vector] = \
                    Partition(partition_vector)
                return (partition_vector, quality_scores)
            else:
                print '>1 clusters found.'
                print 'Quality Scores: {0}'.format(quality_scores)
                recalculate = False

        (partition_vector, nclusters, quality_scores) = \
            self.Clustering.run_spectral_rotate(
                dm, prune=prune, KMeans=KMeans, recalculate=recalculate,
                max_groups=max_groups, min_groups=min_groups)
        self.clusters_to_partitions[(metric, 'rotate', nclusters)] = \
            partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return (partition_vector, quality_scores)

    def put_cluster_trees(
        self,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        optimise='n',
        tmpdir='/tmp',
        overwrite=True,
        max_guide_trees=-1,
    ):
        # NB: max_guide_trees previously defaulted to True, which silently
        # slices the guidetree list to a single tree (True == 1 in a slice);
        # -1 means 'use all guidetrees'
        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if program == 'treecollection':
            return self._put_best_TC_trees(tmpdir=tmpdir,
                                           overwrite=overwrite,
                                           max_guide_trees=max_guide_trees)
        rec_list = self.get_cluster_records()
        print 'Inferring {0} cluster trees'.format(len(rec_list))
        self.put_trees(
            rec_list=rec_list,
            program=program,
            model=model,
            ncat=ncat,
            optimise=optimise,
            datatype=datatype,
            tmpdir=tmpdir,
            overwrite=overwrite,
        )
        self.update_scores()

    def _put_best_TC_trees(self, tmpdir='/tmp', overwrite=True,
                           max_guide_trees=-1):
        rec_list = self.get_cluster_records_with_memberships()
        for (rec, members) in rec_list:
            print 'Calculating treecollection tree for {0}'.format(rec.name),
            if rec.name in self.inferred_trees and not overwrite:
                print 'Skipping - already calculated (overwrite set to False)'
                continue
            guidetrees = [self.keys_to_records[member].tree
                          for member in members]
            if max_guide_trees > 0:
                guidetrees = guidetrees[:max_guide_trees]
            TCtrees = []
            pref = rec._write_temp_tc(make_guide_tree=False, tmpdir=tmpdir)
            pref = '{0}/{1}'.format(tmpdir, pref)
            dv_file = pref + '_dv.txt'
            labels_file = pref + '_labels.txt'
            map_file = pref + '_map.txt'
            if len(guidetrees) > 1:
                print '(using best of {0} guidetrees)'.format(
                    len(guidetrees))
            else:
                print '(using single guidetree)'
            for t in guidetrees:
                guidetree_file = '{0}/{1}.nwk'.format(tmpdir, t.name)
                n = t.reroot_newick()
                with open(guidetree_file, 'w') as writer:
                    writer.write(n)
                TCtrees.append(Tree.new_treecollection_tree(
                    dv_file, map_file, labels_file, guidetree_file,
                    rec.name))
            best = min(TCtrees, key=lambda x: x.score)
            rec.tree = best
            self.inferred_trees[rec.name] = best
        self.update_scores()

    def update_scores(self):
        for partition in self.partitions.values():
            partition.update_score(self.concats)

    @staticmethod
    def _pivot(lst):
        # Transposes a list of strings (previously defined without 'self',
        # so it could not be called as an instance method)
        new_lst = zip(*lst)
        return [''.join(x) for x in new_lst]

    def concatenate_list_of_records(self, records=None):
        if not records:
            records = self.get_records()
        concat = copy.deepcopy(records[0])
        for rec in records[1:]:
            concat += rec
        return concat

    def make_randomised_copy(self, tmpdir=None, get_distances=False,
                             parallel_load=False, overwrite=True):
        shuffled_records = self.get_randomised_alignments()
        if not tmpdir:
            tmpdir = self.tmpdir
        randomised_copy = SequenceCollection(
            input_dir=None,
            records=shuffled_records,
            file_format=self.file_format,
            datatype=self.datatype,
            helper=self.helper,
            tmpdir=tmpdir,
            get_distances=get_distances,
            parallel_load=parallel_load,
            overwrite=overwrite,
        )
        return randomised_copy

    def show_memberships(self):
        # get_partitions() returns (compound_key, Partition) pairs, so
        # iterate over the pairs. (The previous version indexed the list
        # with the key and referenced a non-existent self.clustering.)
        for (compound_key, partition) in self.get_partitions():
            print ' '.join(str(x) for x in compound_key)
            print partition
            print partition.get_membership()

    def simulate_from_record(
        self,
        record,
        output_dir,
        name,
        tmpdir,
        datatype=None,
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
    ):
        if not datatype:
            datatype = self.datatype
        if datatype == 'protein':
            SeqSim.simulate_from_record_WAG(record, output_dir, name,
                                            tmpdir, allow_nonsense,
                                            split_lengths, gene_names)
        elif datatype == 'dna':
            SeqSim.simulate_from_record_GTR(record, output_dir, name,
                                            tmpdir, allow_nonsense,
                                            split_lengths, gene_names)
        else:
            print 'datatype {0} is not recognised'.format(datatype)

    def simulate_from_result(
        self,
        key,
        output_dir,
        name,
        tmpdir,
        datatype=None,
        allow_nonsense=False,
    ):
        if not datatype:
            datatype = self.datatype
        p = self.get_partition(key)
        for c in p.concats:
            # bug: records in Partition objects aren't linked to trees,
            # so look the record up in self.concats instead
            updated_record = self.concats[c.name][0]
            members = c.name.split('-')
            lengths = [self.keys_to_records[int(x)].seqlength
                       for x in members]
            names = ['sim' + self.keys_to_records[int(x)].name
                     for x in members]
            self.simulate_from_record(
                updated_record,
                output_dir,
                name=name,
                tmpdir=tmpdir,
                allow_nonsense=allow_nonsense,
                split_lengths=lengths,
                gene_names=names,
            )

    #######################
    # Getters
    #######################

    def get_trees(self):
        return [rec.tree for rec in self.get_records()]

    def get_cluster_records(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item[0].name))
        return [rec for (rec, _) in sorted(self.concats.values(),
                                           key=sort_key)]

    def get_cluster_records_with_memberships(self):
        """
        Returns all concatenated records from cluster analysis, paired
        with their cluster memberships
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha)
            for (num, alpha) in re.findall(r'(\d+)|(\D+)', item[0].name))
        return sorted(self.concats.values(), key=sort_key)

    def get_cluster_trees(self):
        records = self.get_cluster_records()
        trees = [rec.tree for rec in records]
        return trees

    def get_score(self, key):
        return self.get_partition(key).score

    def get_partition(self, key):
        partition_vector = self.clusters_to_partitions[key]
        return self.partitions[partition_vector]

    def get_membership(self, key, flatten=False):
        return self.get_partition(key).get_membership(flatten=flatten)

    def get_partitions(self):
        return [(k, self.partitions[v]) for (k, v) in
                self.clusters_to_partitions.items()]

    def get_memberships(self, flatten=False):
        return [(k, self.partitions[v].get_membership(flatten=flatten))
                for (k, v) in self.clusters_to_partitions.items()]

    def get_scores(self):
        return [(k, self.partitions[v].score) for (k, v) in
                self.clusters_to_partitions.items()]

    def get_randomised_alignments(self):
        lengths = [rec.seqlength for rec in self.get_records()]
        names = self.get_names()
        concat = self.concatenate_list_of_records()
        concat.shuffle()
        newrecs = concat.split_by_lengths(lengths, names)
        return newrecs

    def get_records(self):
        """
        Returns a list of the stored sequence records
        """

        return [self.keys_to_records[i] for i in range(self.length)]

    def get_names(self):
        """
        Returns a list of the names of the stored records
        """

        return [rec.name for rec in self.get_records()]

    def get_seqlengths(self):
        """
        Returns a list of the sequence lengths of the stored records
        """

        return [rec.seqlength for rec in self.get_records()]

    def get_distance_matrices(self):
        return self.distance_matrices

    def get_dv_matrices(self):
        dvs = {}
        for rec in self.get_records():
            dvs[rec.name] = rec.dv
        return dvs

    #########################
    # Plotters
    #########################

    def plot_dendrogram(self, metric, link, nclasses, show=True):
        # was self.clustering (lowercase), which doesn't exist
        plot_object = self.Clustering.plot_dendrogram((metric, link,
                                                       nclasses))
        if show:
            plot_object.show()
        return plot_object
    def plot_heatmap(self, distance_matrix, partition, outfile=None):
        sort_partition = partition.get_membership(flatten=True)
        fig = distance_matrix.plot_heatmap(sort_partition=sort_partition)
        if outfile:
            fig.savefig('{0}.pdf'.format(outfile))
        return fig

    def plot_embedding(
        self,
        partition_vector,
        distance_matrix,
        embedding='MDS',
        prune=True,
        dimensions=3,
        centre_of_mass=False,
        outfile=None,
        standardize=False,
        normalise=False,
        annotate=False,
    ):
        """
        Plots an embedding of the trees in a Principal Coordinate space,
        and saves the figures as pdf if outfile is given.
        NB: the dimensions argument is currently unused - plotting is
        fixed at three coordinates (2d and 3d views)
        """

        dm = distance_matrix.matrix
        partition_vector = np.array(partition_vector)
        labels = self.get_names()

        if embedding == 'MDS':
            dbc = self.Clustering.get_double_centre(dm)
            (vals, vecs, var_exp) = self.Clustering.get_eigen(
                dbc, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(
                vals, vecs, var_exp, 3, normalise=normalise)
        elif embedding == 'spectral':
            laplacian = self.Clustering.spectral(dm, prune=prune)
            (vals, vecs, var_exp) = self.Clustering.get_eigen(
                laplacian, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(
                vals, vecs, var_exp, 3, normalise=normalise)
        else:
            print "embedding should be one of 'MDS' or 'spectral'"
            print 'value given was:', embedding
            return

        min_Z = min([z for (x, y, z) in coords])

        # get the indices of the partition vector for each group
        # and store in this list
        P = []
        max_groups = max(partition_vector)
        for i in range(1, max_groups + 1):
            partition = np.where(partition_vector == i)
            P.append(partition)

        colors = 'bgrcmyk'
        fig2d = plt.figure()
        fig3d = plt.figure()
        ax2d = fig2d.add_subplot(111)
        ax3d = fig3d.add_subplot(111, projection='3d')
        for (pos, partition) in enumerate(P):
            for i in partition[0]:
                ax2d.scatter(color=colors[pos % len(colors)],
                             *(coords[i])[:2])
                ax3d.scatter(color=colors[pos % len(colors)], *coords[i])
                # drop a vertical guide line from each point to the floor
                ax3d.plot([coords[i][0], coords[i][0]],
                          [coords[i][1], coords[i][1]],
                          [min_Z, coords[i][2]], color='grey',
                          linewidth=0.2)
                if annotate:
                    ax2d.annotate(
                        labels[i],
                        xy=(coords[i][0], coords[i][1]),
                        xytext=(-20, 20),
                        textcoords='offset points',
                        fontsize='x-small',
                        ha='right',
                        va='bottom',
                        bbox=dict(boxstyle='round,pad=0.5', fc='yellow',
                                  alpha=0.5),
                        arrowprops=dict(arrowstyle='->',
                                        connectionstyle='arc3,rad=0'),
                    )
            if centre_of_mass:
                com = np.mean(coords[partition], axis=0)
                ax2d.scatter(color='k', marker='x', s=2, *com[:2])
                ax3d.scatter(color='k', marker='x', s=2, *com)
        if embedding == 'spectral' and normalise:
            # sketch the unit sphere that normalised spectral embeddings
            # lie on
            (u, v) = np.mgrid[0:2 * np.pi:20j, 0:np.pi:10j]
            x = np.cos(u) * np.sin(v)
            y = np.sin(u) * np.sin(v)
            z = np.cos(v)
            ax3d.plot_wireframe(x, y, z, color='grey', linewidth=0.2)
        ax2d.set_xlabel('PCo1')
        ax2d.set_ylabel('PCo2')
        ax2d.set_title('Trees embedded in dimension-reduced space')
        ax3d.set_xlabel('PCo1')
        ax3d.set_ylabel('PCo2')
        ax3d.set_zlabel('PCo3')
        ax3d.set_title('Trees embedded in dimension-reduced space')
        if outfile:
            fig2d.savefig('{0}-2d.pdf'.format(outfile))
            fig3d.savefig('{0}-3d.pdf'.format(outfile))
        return (fig2d, fig3d)

    #########################
    # Parallelisers
    #########################

    def _unpack_dv(self, packed_args):
        return packed_args[0].get_dv_matrix(*packed_args[1:])

    def _dv_parallel_call(self, tmpdir='/tmp',
                          helper='./class_files/DV_wrapper.drw',
                          overwrite=True):
        nprocesses = min(self.length, multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, self.length)
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in self.get_records():
            # each record gets its own scratch directory under tmpdir
            new_dir = tmpdir + '/' + rec.name
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            args.append((rec, new_dir, helper, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_dv, args, callback=results.append)
        r.wait()
        for (_, directory, _, _) in args:
            if os.path.isdir(directory):
                os.rmdir(directory)
        results = results[0]
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results))

    def put_dv_matrices_parallel(self, tmpdir='/tmp',
                                 helper='./class_files/DV_wrapper.drw',
                                 overwrite=True):
        dv_matrices_dict = self._dv_parallel_call(tmpdir, helper,
                                                  overwrite=overwrite)
        for rec in self.get_records():
            rec.dv = [dv_matrices_dict[rec.name]]

    def _unpack_bionj(self, packed_args):
        return packed_args[0].get_bionj_tree(*packed_args[1:])

    def _bionj_parallel_call(self, model=None, datatype=None, rec_list=None,
                             ncat=1, tmpdir='/tmp', overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_bionj, args,
                           callback=results.append)
        r.wait()
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results[0]))

    def _unpack_phyml(self, packed_args):
        return packed_args[0].get_phyml_tree(*packed_args[1:])

    def _phyml_parallel_call(self, model=None, datatype=None, rec_list=None,
                             ncat=4, tmpdir='/tmp', overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_phyml, args,
                           callback=results.append)
        r.wait()
        print 'Results obtained, closing pool...'
        pool.close()
        pool.join()
        print 'Pool closed'
        return dict(zip(names, results[0]))
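    # Note on the pool pattern above (explanatory comment, not original):
    # map_async's callback receives the complete result list in a single
    # call, so results.append(...) leaves that list at results[0], and
    # dict(zip(names, results[0])) maps record names to outputs in
    # submission order.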
    def _unpack_raxml(self, packed_args):
        return packed_args[0].get_raxml_tree(*packed_args[1:])

    def _raxml_parallel_call(self, rec_list=None, tmpdir='/tmp',
                             overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_raxml, args,
                           callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def _unpack_TC(self, packed_args):
        return packed_args[0].get_TC_tree(*packed_args[1:])

    def _TC_parallel_call(self, rec_list=None, tmpdir='/tmp',
                          overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print 'Initialising a pool of {0} processes running {1} jobs...'.format(
            nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_TC, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def put_trees_parallel(
        self,
        rec_list=None,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):
        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        if not rec_list:
            rec_list = self.records
        if program == 'treecollection':
            trees_dict = self._TC_parallel_call(rec_list=rec_list,
                                                tmpdir=tmpdir,
                                                overwrite=overwrite)
        elif program == 'raxml':
            trees_dict = self._raxml_parallel_call(rec_list=rec_list,
                                                   tmpdir=tmpdir,
                                                   overwrite=overwrite)
        elif program == 'phyml':
            trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype,
                tmpdir=tmpdir, ncat=ncat, overwrite=overwrite)
        elif program == 'bionj':
            trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype,
                tmpdir=tmpdir, ncat=ncat, overwrite=overwrite)
        # Assign trees back to the records that were actually processed
        # (iterating over all records raised KeyError when rec_list was a
        # subset)
        for rec in rec_list:
            rec.tree = trees_dict[rec.name]
            self.inferred_trees[rec.name] = trees_dict[rec.name]

    def put_cluster_trees_parallel(
        self,
        program='treecollection',
        model=None,
        datatype=None,
        ncat=4,
        tmpdir='/tmp',
        overwrite=True,
    ):
        if program not in ['treecollection', 'raxml', 'phyml', 'bionj']:
            print 'unrecognised program {0}'.format(program)
            return
        rec_list = self.get_cluster_records()
        print 'Inferring {0} cluster trees'.format(len(rec_list))
        if program == 'treecollection':
            cluster_trees_dict = self._TC_parallel_call(
                rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == 'raxml':
            cluster_trees_dict = self._raxml_parallel_call(
                rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == 'phyml':
            cluster_trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype,
                ncat=ncat, tmpdir=tmpdir, overwrite=overwrite)
        elif program == 'bionj':
            cluster_trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype,
                ncat=ncat, tmpdir=tmpdir, overwrite=overwrite)
        for rec in rec_list:
            rec.tree = cluster_trees_dict[rec.name]
        self.update_scores()  # was self.update_results(), which doesn't exist
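
# -----------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal end-to-end run.
# Assumes a directory of fasta alignments at './alignments'; the path, the
# 'euc' metric and the 'spectral' cluster method are illustrative values --
# the valid names depend on the DistanceMatrix and Clustering implementations.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    collection = SequenceCollection(
        input_dir='./alignments',  # assumed example path
        file_format='fasta',
        datatype='protein',
        tmpdir='/tmp',
        get_distances=False,
    )
    print collection  # "Contains N alignments"

    # Infer one tree per alignment, cluster the trees by a tree-distance
    # metric, then build and score one concatenated tree per cluster
    collection.put_trees(program='bionj')
    collection.put_partitions(metrics='euc', cluster_methods='spectral',
                              nclusters=[2, 3, 4])
    collection.concatenate_records()
    collection.put_cluster_trees(program='bionj')
    for (key, score) in collection.get_scores():
        print key, score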