def get_decomp(self, method='MDS', **kwargs):
    optioncheck(method, ['MDS', 'spectral'])
    cl = Clustering(self.dm)
    if method == 'MDS':
        return cl.mds_decomp()
    if method == 'spectral':
        return cl.spectral_decomp(**kwargs)
def __init__(
        self,
        records=None,
        input_dir=None,
        param_dir=None,
        file_format='fasta',
        compression=None,
        header_grep=None,
):
    self._records = None
    self._input_files = None

    if records is not None:
        self.records = records
    elif input_dir is not None:
        input_dir = os.path.abspath(input_dir)
        directorycheck(input_dir)
        optioncheck(file_format, ['fasta', 'phylip'])
        self.records = self.read_alignments(input_dir, file_format,
                                            header_grep, compression)
    else:
        raise Exception('Provide a list of records, '
                        'or the path to a set of alignments')

    if param_dir is not None:
        self.read_parameters(param_dir)

    if not self.records:
        raise NoRecordsError(file_format, input_dir, compression)
def embedding(self, dimensions, method, **kwargs):
    """
    Embeds the distance matrix in a coordinate space. Implemented methods
    are:
        cmds: Classical MultiDimensional Scaling
        kpca: Kernel Principal Components Analysis
        mmds: Metric MultiDimensional Scaling
        nmmds: Non-Metric MultiDimensional Scaling
        spectral: Spectral decomposition of Laplacian matrix

    Valid kwargs:
        kpca: affinity_matrix - a precomputed array of affinities
              sigma - the value of sigma to use when computing the affinity
                  matrix via the Radial Basis Function
        nmmds: initial_coords - a set of coordinates to refine. NMMDS works
                   very badly without this
        spectral: affinity_matrix, sigma
                  unit_length - scale the coordinates to unit length, so
                      points sit on the surface of the unit sphere

    :param dimensions: (int) number of coordinate axes to use
    :param method: (string) one of cmds, kpca, mmds, nmmds, spectral
    :param kwargs: unit_length (bool), affinity_matrix (np.array),
        sigma (float), initial_coords (np.array)
    :return: coordinate matrix (np.array)
    """
    optioncheck(method, ['cmds', 'kpca', 'mmds', 'nmmds', 'spectral'])
    if method == 'cmds':
        return self._embedding_classical_mds(dimensions)
    elif method == 'kpca':
        return self._embedding_kernel_pca(dimensions, **kwargs)
    elif method == 'mmds':
        return self._embedding_metric_mds(dimensions)
    elif method == 'nmmds':
        return self._embedding_nonmetric_mds(dimensions, **kwargs)
    elif method == 'spectral':
        return self._embedding_spectral(dimensions, **kwargs)
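# Illustrative usage of embedding() above -- `dm` stands for any instance of
# the distance-matrix class, and the sigma value is an arbitrary example, not
# a library default:
def _embedding_example(dm):
    coords3d = dm.embedding(3, 'spectral', sigma=2.0, unit_length=True)
    coords2d = dm.embedding(2, 'cmds')
    return coords3d, coords2d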
def autocorrelated_relaxed_clock(self, root_rate, autocorrel,
                                 distribution='lognormal'):
    """
    Attaches rates to each node according to autocorrelated lognormal
    model from Kishino et al. (2001), or autocorrelated exponential
    """
    optioncheck(distribution, ['exponential', 'lognormal'])

    if autocorrel == 0:
        for node in self.preorder_node_iter():
            node.rate = root_rate
        return

    for node in self.preorder_node_iter():
        if node == self.seed_node:
            node.rate = root_rate
        else:
            parent_rate = node.parent_node.rate
            bl = node.edge_length
            if distribution == 'lognormal':
                node.rate = logn_correlated_rate(parent_rate, bl,
                                                 autocorrel)
            else:
                node.rate = np.random.exponential(parent_rate)
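# A minimal sketch of what logn_correlated_rate might look like -- the real
# helper is defined elsewhere in the library, so this is an assumption based
# on the Kishino et al. (2001) model: the child's rate is lognormal with
# expectation equal to the parent's rate, and log-space variance
# proportional to the branch length.
import numpy as np

def logn_correlated_rate_sketch(parent_rate, branch_length, autocorrel_param):
    variance = branch_length * autocorrel_param
    mu = np.log(parent_rate) - 0.5 * variance  # keeps E[rate] == parent_rate
    return np.random.lognormal(mean=mu, sigma=np.sqrt(variance))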
def __init__(
        self,
        collection,
        analysis,
        lsf=False,
        max_guidetrees=10,
        tmpdir=None,
        datatype=None,
        verbosity=0,
        populate_cache=True,
        debug=False,
):
    optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
    if analysis == 'tc':
        self.analysis = 'TreeCollection'
    else:
        self.analysis = analysis
    self.max_guidetrees = max_guidetrees
    self.lsf = lsf
    self.collection = collection
    self.datatype = datatype or collection.datatype
    self.verbosity = verbosity
    optioncheck(self.datatype, ['protein', 'dna'])
    self.tmpdir = tmpdir or collection.tmpdir
    directorymake(self.tmpdir)
    self.cache = {}
    self.history = []
    self.debug = debug
    if populate_cache:
        self.populate_cache()
def uncorrelated_relaxed_clock(self, root_rate, variance,
                               distribution='lognormal'):
    optioncheck(distribution, ['exponential', 'lognormal'])

    for node in self.preorder_node_iter():
        if node == self.seed_node:
            node.rate = root_rate
        else:
            if distribution == 'lognormal':
                mu = np.log(root_rate) - 0.5 * variance
                # np.random.lognormal takes the standard deviation of the
                # underlying normal, so pass sqrt(variance), not variance
                node.rate = np.random.lognormal(mu, np.sqrt(variance))
            else:
                node.rate = np.random.exponential(root_rate)
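# Why mu = log(root_rate) - variance/2: a lognormal variable with underlying
# normal parameters (mu, sigma^2) has expectation exp(mu + sigma^2/2), so the
# offset keeps the expected node rate equal to root_rate. A quick runnable
# check with illustrative values:
import numpy as np

root_rate, variance = 1.0, 0.25
mu = np.log(root_rate) - 0.5 * variance
samples = np.random.lognormal(mu, np.sqrt(variance), size=100000)
print(samples.mean())  # ~1.0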
def embedding_plotter(
        self,
        coordinates,
        dimensions,
        partition=None,
        add_sphere=False,
        xlab='PCo1',
        ylab='PCo2',
        zlab='PCo3',
        title='Trees embedded in dimension-reduced space',
        outfile=False,
):
    """
    Points are coloured according to cluster membership specified by
    Partition object (or all black if no Partition specified)
    """
    optioncheck(dimensions, [2, 3])
    partition = (partition
                 or Partition(tuple([0] * len(coordinates))))
    colours = list(itertools.islice(itertools.cycle('bgrcmyk'),
                                    len(partition)))
    colour_mapping = np.array([colours[i - 1]
                               for i in partition.partition_vector])
    fig = plt.figure()
    if dimensions == 3:
        ax = fig.add_subplot(111, projection='3d',
                             xlabel=xlab, ylabel=ylab, zlabel=zlab,
                             title=title)
        if add_sphere:
            ax = self.sphere(ax)
    else:
        ax = fig.add_subplot(111, xlabel=xlab, ylabel=ylab, title=title)
    ax.scatter(*coordinates.T, color=colour_mapping)
    # ax.set_aspect(1)
    if outfile:
        fig.savefig('{0}.pdf'.format(outfile))
    return fig
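# Illustrative usage of embedding_plotter() -- `cl` is assumed to be an
# instance of the plotting class, `coords` the output of embedding(), and
# the Partition is a hypothetical 2-cluster assignment over four trees:
def _plot_example(cl, coords):
    part = Partition((1, 1, 2, 2))
    fig = cl.embedding_plotter(coords, 2, partition=part,
                               outfile='embedded_trees')
    return fig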
def __init__(
        self,
        records=None,
        input_dir=None,
        trees_dir=None,
        file_format='fasta',
        datatype=None,
        tmpdir=TMPDIR,
        calc_distances=False,
        compression=None,
        debug=False,
):
    self.tmpdir = directorymake(tmpdir)
    self._records = None
    self.debug = debug

    if records:
        self.records = records
        self.datatype = datatype or records[0].datatype
        optioncheck(self.datatype, ['dna', 'protein'])
        for rec in records:
            rec.tmpdir = self.tmpdir
    elif input_dir:
        directorycheck(input_dir)
        self.datatype = optioncheck(datatype, ['dna', 'protein'])
        optioncheck(file_format, ['fasta', 'phylip'])
        self.records = self.read_alignments(input_dir, file_format,
                                            compression)
    else:
        raise Exception('Provide a list of records, '
                        'or the path to a set of alignments')

    self.taxon_set = TaxonSet()
    if trees_dir:
        self.read_trees(trees_dir, self.taxon_set)
    if not self.records:
        raise NoRecordsError(file_format, input_dir, compression)
    if calc_distances:
        self.calc_distances()
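# Illustrative construction -- `Collection` is an assumed name for the class
# this __init__ belongs to, and the directory path is an example:
def _collection_example():
    collection = Collection(input_dir='data/alignments',
                            file_format='fasta',
                            datatype='protein',
                            compression='gz',
                            calc_distances=True)
    return collection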
def __init__(self, nclusters, collection, tmpdir=TMPDIR, analysis='nj',
             initial_assignment=None, scorer=None):
    optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
    if analysis == 'tc':
        self.analysis = 'TreeCollection'
    else:
        self.analysis = analysis
    self.Collection = collection

    if not self.Collection.records[0].tree:
        print('Calculating {} trees for collection...'.format(analysis))
        self.Collection.calc_NJ_trees()

    self.datatype = collection.datatype
    if scorer is not None and isinstance(scorer, Scorer):
        self.scorer = scorer
    else:
        self.scorer = Scorer(self.Collection.records,
                             analysis=analysis,
                             datatype=self.datatype,
                             tmpdir=tmpdir)
    self.nclusters = nclusters
    self.tmpdir = tmpdir

    print('Calculating initial scores...')
    if initial_assignment is None:
        initial_assignment = Partition(tuple([0] * len(collection)))
        # initial_assignment = self.random_partition(nclusters)

    self.global_best_scores = {}
    self.global_best_assignments = {}
    self.global_best_scores[self.nclusters] = self.scorer.score(
        initial_assignment, history=True)
    self.global_best_assignments[self.nclusters] = initial_assignment

    self.done_worse = 0
    self.stayed_put = 0
    self.i = 0
    self.resets = 0
    self.merges = 0
def __new__(
        cls,
        trees,
        metric,
        tmpdir=TMPDIR,
        dtype=float,
        add_noise=False,
        normalise=False,
        dec_places=None,
        lsf=False,
):
    optioncheck(metric, ['euc', 'geo', 'rf', 'wrf'])
    input_array = get_distance_matrix(trees, metric, tmpdir,
                                      normalise=normalise,
                                      dec_places=dec_places,
                                      lsf=lsf)
    obj = np.asarray(input_array, dtype).view(cls)
    obj.metric = metric
    obj.tmpdir = tmpdir
    if add_noise:
        obj = obj.add_noise()
    return obj
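# Illustrative usage -- `DistanceMatrix` is an assumed name for the ndarray
# subclass defined by this __new__, and `trees` would be a list of tree
# objects accepted by get_distance_matrix:
def _distance_matrix_example(trees):
    dm = DistanceMatrix(trees, 'geo', normalise=True)    # geodesic distances
    noisy = DistanceMatrix(trees, 'rf', add_noise=True)  # jittered RF distances
    return dm, noisy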
def make_new_assignment(self, sample, scores, assignment, nreassign=1,
                        choose='max'):
    """
    Makes a new partition by reassigning records between clusters
    """
    optioncheck(choose, ('max', 'min'))
    new_clusters = scores.argmax(axis=1)
    M = scores / scores.sum(axis=1)[:, np.newaxis]
    if choose == 'max':
        reassignments = M.max(axis=1).argsort()[-nreassign:]
    else:
        reassignments = M.min(axis=1).argsort()[:nreassign]
    new_assignment = list(assignment.partition_vector)
    for i in reassignments:
        # cluster numbers are in the range [1, x], whereas new_clusters
        # is in the range [0, x-1], hence the "+ 1"
        new_assignment[sample[i]] = new_clusters[i] + 1
    return Partition(tuple(new_assignment))
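# A runnable sketch of the reassignment arithmetic above, with made-up
# scores: four records scored against two clusters. Row-normalising gives a
# membership weight per cluster; argsort of the row maxima orders records by
# how confidently they sit in their best cluster, and choose='max' picks the
# most confident ones to move.
import numpy as np

scores = np.array([[0.9, 0.1],
                   [0.2, 0.8],
                   [0.6, 0.4],
                   [0.5, 0.5]])
M = scores / scores.sum(axis=1)[:, np.newaxis]
print(scores.argmax(axis=1))    # best cluster per record: [0 1 0 0]
print(M.max(axis=1).argsort())  # records by rising confidence: [3 2 1 0]
# with nreassign=1, choose='max' -> record 0 moves to cluster 0 + 1 = 1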
def __init__(self, nclusters, collection, tmpdir=TMPDIR, analysis='nj',
             initial_assignment=None, scorer=None):
    optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
    if analysis == 'tc':
        self.analysis = 'TreeCollection'
    else:
        self.analysis = analysis
    self.Collection = collection

    if not self.Collection.records[0].tree:
        print('Calculating {} trees for collection...'.format(analysis))
        self.Collection.calc_NJ_trees()

    self.datatype = collection.datatype
    if scorer is not None and isinstance(scorer, Scorer):
        self.scorer = scorer
    else:
        self.scorer = Scorer(self.Collection)
    self.nclusters = nclusters
    self.tmpdir = tmpdir

    print('Calculating initial scores...')
    if initial_assignment is None:
        initial_assignment = Partition(tuple([0] * len(collection)))
        # initial_assignment = self.random_partition(nclusters)

    self.global_best_scores = {}
    self.global_best_assignments = {}
    self.global_best_scores[self.nclusters] = self.scorer.score(
        initial_assignment, history=True)
    self.global_best_assignments[self.nclusters] = initial_assignment

    self.done_worse = 0
    self.stayed_put = 0
    self.i = 0
    self.resets = 0
    self.merges = 0
def calc_phyml_trees(self, analysis='nj', lsf=False, strategy='dynamic',
                     minmem=256, bootstraps=None, add_originals=False,
                     verbosity=0):
    """ Calculates trees for each record using phyml """
    optioncheck(analysis, ANALYSES)
    if bootstraps is not None:
        bootstraps = int(isnumbercheck(bootstraps))
        records = list(itertools.chain(
            *[[r.bootstrap_sample(str(i)) for i in range(bootstraps)]
              for r in self]))
        if add_originals:
            records.extend(self.records)
    else:
        records = self.records

    if lsf:
        trees = runLSFPhyml(records,
                            self.tmpdir,
                            analysis=analysis,
                            verbosity=verbosity,
                            strategy=strategy,
                            minmem=minmem,
                            taxon_set=self.taxon_set,
                            debug=self.debug)
        for rec, tree in zip(records, trees):
            rec.tree = TrClTree.cast(tree)
    else:
        for rec in records:
            runPhyml(rec, self.tmpdir, analysis=analysis,
                     verbosity=verbosity, taxon_set=self.taxon_set)
            rec.tree = TrClTree.cast(rec.tree)
    if verbosity == 1:
        print()
    if bootstraps is not None:
        return [r.tree for r in records]
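# Illustrative call -- `collection` is assumed to be an instance of the
# class above; 'nj' is used because it appears as a valid analysis elsewhere
# in this module:
def _phyml_example(collection):
    boot_trees = collection.calc_phyml_trees(analysis='nj', bootstraps=100,
                                             add_originals=True)
    return boot_trees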
def simulate(self, index_tuple, model=None, lsf=False, ntimes=1):
    """
    Simulate a group of sequence alignments using ALF. Uses one of
    {(GCB, JTT, LG, WAG - protein), (CPAM, ECM and ECMu - DNA)},
    WAG by default.

    TO DO: add parameterised models when I have a robust (probably PAML)
    method of estimating them from alignment+tree
    """
    if self.datatype == 'protein':  # set some defaults
        model = model or 'WAG'
        optioncheck(model, ['CPAM', 'ECM', 'ECMu', 'WAG', 'JTT', 'GCB',
                            'LG'])
    else:
        model = model or 'GTR'
        try:
            optioncheck(model, ['CPAM', 'ECM', 'ECMu', 'GTR'])
        except OptionError as e:
            print('Choose a DNA-friendly model for simulation:\n', e)
            return
def read_alignments(self, input_dir, file_format, compression=None):
    """ Get list of alignment files from an input directory *.fa, *.fas
    and *.phy files only

    Stores in self.files """
    optioncheck(compression, [None, 'gz', 'bz2'])

    if file_format == 'fasta':
        extensions = ['fa', 'fas', 'fasta']
    elif file_format == 'phylip':
        extensions = ['phy']
    else:
        extensions = []

    if compression:
        extensions = ['.'.join([x, compression]) for x in extensions]

    files = fileIO.glob_by_extensions(input_dir, extensions)
    files.sort(key=SORT_KEY)
    return [TrClSeq(f,
                    file_format=file_format,
                    datatype=self.datatype,
                    name=fileIO.strip_extensions(f),
                    tmpdir=self.tmpdir)
            for f in files]
def decomp_to_coords(self, decomp, dimensions, normalise=False):
    optioncheck(dimensions, [2, 3])
    coords = decomp.coords_by_dimension(dimensions)[0]
    return coords.normalise_rows() if normalise else coords
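# Illustrative pipeline from get_decomp() to plottable coordinates -- `obj`
# is assumed to be an instance of the class providing both methods:
def _coords_example(obj):
    decomp = obj.get_decomp(method='spectral')
    coords = obj.decomp_to_coords(decomp, 2, normalise=True)
    return coords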
def __init__(
        self,
        class_list,
        permutations_list,
        nspecies,
        tmpdir,
        datatype='protein',
        master_tree_generator_method='yule',
        master_tree=None,
        class_tree_permuter='nni',
        gene_length_kappa=1.7719,
        gene_length_theta=279.9,
        gene_length_min=10,
        gamma_rate_param=None,
        outdir='./',
        autocorrelated_relaxed_clock=False,
        uncorrelated_relaxed_clock=False,
        scale_rates=False,
        verbosity=0,
):
    errors.optioncheck(master_tree_generator_method,
                       ['yule', 'coal', 'rtree', 'custom'])
    errors.optioncheck(class_tree_permuter,
                       ['nni', 'spr', 'lgt', 'genetree'])
    if master_tree is None and master_tree_generator_method == 'custom':
        raise Exception('No custom tree was specified')
    self.num_classes = len(class_list)
    self.num_genes = sum(class_list)
    self.class_list = class_list
    self.verbosity = verbosity
    self.autocorrelated_relaxed_clock = autocorrelated_relaxed_clock
    self.uncorrelated_relaxed_clock = uncorrelated_relaxed_clock
    self.scale_rates = scale_rates
    self.gene_trees = list()

    if master_tree is None:
        tree = self.generate_master_tree(master_tree_generator_method,
                                         nspecies)
        self.master_tree = tree
        self.num_species = nspecies
    else:
        self.master_tree = master_tree
        self.num_species = nspecies
        if len(master_tree) != nspecies:
            msg = ['Warning: supplied tree has {0} taxa.'.format(
                       len(master_tree)),
                   'Required number is {0}.\n'.format(nspecies),
                   'Resetting number of species to match the supplied '
                   'tree.']
            print(''.join(msg))
            self.num_species = len(master_tree)

    self.set_gene_lengths(gene_length_kappa, gene_length_theta,
                          gene_length_min)
    self.gamma_rate_param = gamma_rate_param
    self.permuter = class_tree_permuter
    self.permutations_list = permutations_list
    self.datatype = datatype
    self.tmpdir = errors.directorymake(tmpdir)
    self.outdir = outdir
    self.generate_class_trees()  # sets self.class_trees dict
    self.make_alf_dirs()  # sets self.alf_dirs dict
    self.write_alf_params()
    self.get_true_partition()
def __init__(
        self,
        class_list,
        permutations_list,
        nspecies,
        subst_model,
        rate_model,
        master_tree_generator_method='yule',
        master_tree=None,
        class_tree_permuter='nni',
        gene_length_kappa=1.7719,
        gene_length_theta=279.9,
        gene_length_min=10,
        gamma_rate_param=None,
        outdir='./',
        tmpdir='/tmp',        # referenced below, so must be a parameter
        datatype='protein',   # referenced below, so must be a parameter
        autocorrelated_relaxed_clock=False,
        uncorrelated_relaxed_clock=False,
        scale_rates=False,
        verbosity=0,
):
    errors.optioncheck(master_tree_generator_method,
                       ['yule', 'coal', 'rtree', 'custom'])
    errors.optioncheck(class_tree_permuter,
                       ['nni', 'spr', 'lgt', 'genetree'])
    if master_tree is None and master_tree_generator_method == 'custom':
        raise Exception('No custom tree was specified')
    self.num_classes = len(class_list)
    self.num_genes = sum(class_list)
    self.class_list = class_list
    self._master_tree = None
    self.subst_model = subst_model
    self.rate_model = rate_model
    self.verbosity = verbosity
    self.autocorrelated_relaxed_clock = autocorrelated_relaxed_clock
    self.uncorrelated_relaxed_clock = uncorrelated_relaxed_clock
    self.scale_rates = scale_rates
    self.gene_trees = list()

    if master_tree is None:
        tree = self.generate_master_tree(master_tree_generator_method,
                                         nspecies)
        self.master_tree = tree
        self.num_species = nspecies
    else:
        self.master_tree = master_tree
        self.num_species = nspecies
        if len(master_tree) != nspecies:
            msg = ['Warning: supplied tree has {0} taxa.'.format(
                       len(master_tree)),
                   'Required number is {0}.\n'.format(nspecies),
                   'Resetting number of species to match the supplied '
                   'tree.']
            print(''.join(msg))
            self.num_species = len(master_tree)

    self.set_gene_lengths(gene_length_kappa, gene_length_theta,
                          gene_length_min)
    self.gamma_rate_param = gamma_rate_param
    self.permuter = class_tree_permuter
    self.permutations_list = permutations_list
    self.datatype = datatype
    self.tmpdir = errors.directorymake(tmpdir)
    self.outdir = outdir
    self.generate_class_trees()  # sets self.class_trees dict
    self.make_alf_dirs()  # sets self.alf_dirs dict
    self.write_alf_params()
    self.get_true_partition()
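# Illustrative construction -- `Simulator` is an assumed class name, and the
# subst_model/rate_model values are example choices, not verified defaults:
def _simulator_example():
    sim = Simulator(class_list=[10, 10], permutations_list=[3, 3],
                    nspecies=20, subst_model='WAG', rate_model='gamma',
                    tmpdir='/tmp/alfsim')
    return sim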
def read_alignments(self, input_dir, file_format, header_grep=None,
                    compression=None):
    """ Get list of alignment files from an input directory *.fa, *.fas
    and *.phy files only

    Stores in self.files """
    optioncheck(compression, [None, 'gz', 'bz2'])

    if file_format == 'fasta':
        extensions = ['fa', 'fas', 'fasta']
    elif file_format == 'phylip':
        extensions = ['phy']
    else:
        extensions = []

    if compression:
        extensions = ['.'.join([x, compression]) for x in extensions]

    files = fileIO.glob_by_extensions(input_dir, extensions)
    files.sort(key=SORT_KEY)
    self._input_files = files
    records = []

    pbar = setup_progressbar("Loading files", len(files),
                             simple_progress=True)
    pbar.start()

    for i, f in enumerate(files):
        if compression is not None:
            with fileIO.TempFile() as tmpfile:
                with fileIO.freader(f, compression) as reader, \
                        fileIO.fwriter(tmpfile) as writer:
                    for line in reader:
                        writer.write(line)
                try:
                    record = Alignment(tmpfile, file_format, True)
                except RuntimeError:
                    record = Alignment(tmpfile, file_format, False)
        else:
            try:
                record = Alignment(f, file_format, True)
            except RuntimeError:
                record = Alignment(f, file_format, False)

        if header_grep:
            try:
                datatype = 'dna' if record.is_dna() else 'protein'
                record = Alignment([(header_grep(x), y)
                                    for (x, y) in record.get_sequences()],
                                   datatype)
            except TypeError:
                raise TypeError("Couldn't apply header_grep to header\n"
                                "alignment number={}, name={}\n"
                                "header_grep={}".format(
                                    i, fileIO.strip_extensions(f),
                                    header_grep))
            except RuntimeError:
                print('RuntimeError occurred processing alignment '
                      'number={}, name={}'.format(
                          i, fileIO.strip_extensions(f)))
                raise

        record.name = fileIO.strip_extensions(f)
        records.append(record)
        pbar.update(i)
    pbar.finish()
    return records
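# Illustrative header_grep: any callable mapping a sequence header to the
# label to keep. Here a hypothetical regex keeps everything before the first
# '|', e.g. 'Homo_sapiens|gene123' -> 'Homo_sapiens'.
import re

def example_header_grep(header):
    match = re.match(r'[^|]+', header)
    return match.group() if match else header

# e.g. records = collection.read_alignments('alignments/', 'fasta',
#                                           header_grep=example_header_grep,
#                                           compression='gz')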