def add_minsq_partitions(self, partitions, batchsize=1, background=False):
    """
    Compute MinSq results for every cluster of `partitions` that is not
    yet a key of `self.minsq_cache`, and store them in that cache.

    :param partitions: a single Partition, or an iterable of Partitions
    :param batchsize: forwarded to `parallel_map` when a client exists
    :param background: forwarded to `parallel_map`; when true in the
        parallel case the map result is returned and nothing is cached
    :return: None, or the `parallel_map` result when run in background
    """
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    # Membership tuples that have no cache entry yet
    requested = set(ix for partition in partitions
                    for ix in partition.get_membership())
    pending = requested.difference(self.minsq_cache.keys())
    if not pending:
        return

    # Collect argument list: one entry per pending cluster
    args = [self.concatenate(ix).get_tree_collection_strings()
            for ix in pending]

    # Distribute work
    msg = 'Adding MinSq cluster trees'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.minsq_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.minsq_task, args, msg,
                                  batchsize, background)
        # NOTE(review): the early return is taken to belong to the
        # parallel branch only -- confirm against the original layout.
        if background:
            return map_result

    # Process results. `pending` is not modified after `args` was built,
    # so it iterates in the same order here; this presumes the map
    # helpers return results in argument order -- TODO confirm.
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for count, (ix, result) in enumerate(zip(pending, map_result)):
        self.minsq_cache[ix] = result
        pbar.update(count)
    pbar.finish()
def add_minsq_partitions(self, partitions, batchsize=1, background=False):
    """
    Fill `self.minsq_cache` with MinSq results for the clusters found in
    `partitions` that the cache does not already contain.

    :param partitions: one Partition or an iterable of Partition objects
    :param batchsize: passed through to `parallel_map`
    :param background: passed through to `parallel_map`; if true and a
        client is available, the map result is returned without caching
    :return: None, or the `parallel_map` result in background mode
    """
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    # Every membership tuple mentioned by the partitions ...
    all_clusters = {ix for partition in partitions
                    for ix in partition.get_membership()}
    # ... minus those already cached
    todo = all_clusters.difference(self.minsq_cache.keys())
    if not todo:
        return

    # Collect argument list
    args = []
    for cluster in todo:
        concatenation = self.concatenate(cluster)
        args.append(concatenation.get_tree_collection_strings())

    # Distribute work
    msg = 'Adding MinSq cluster trees'
    client = get_client()
    if client is not None:
        map_result = parallel_map(client, tasks.minsq_task, args, msg,
                                  batchsize, background)
        # NOTE(review): early return assumed to apply to the parallel
        # branch only -- confirm against the original layout.
        if background:
            return map_result
    else:
        map_result = sequential_map(tasks.minsq_task, args, msg)

    # Process results: pair each cluster with its result. `todo` is
    # iterated in the same order as when `args` was built; presumably
    # the map helpers preserve argument order -- verify.
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for position, (cluster, result) in enumerate(zip(todo, map_result)):
        self.minsq_cache[cluster] = result
        pbar.update(position)
    pbar.finish()
def _generic_matrix_calc(fn, trees, normalise, min_overlap=4):
    """
    Calculate all pairwise distances between the trees in `trees` and
    return them as a square matrix.

    Each pair is passed to `_generic_distance_calc` together with `fn`,
    `normalise` and `min_overlap`. This is the shared implementation
    behind the leafset-checking wrappers:
        eucdist_matrix
        geodist_matrix
        rfdist_matrix
        wrfdist_matrix

    If the faster non-leafset-checking functions are needed, build the
    condensed distance list yourself, e.g.
        ['getDistance'(t1.phylotree, t2.phylotree, normalise)
         for (t1, t2) in itertools.combinations(trees, 2)]
    with 'getDistance' chosen from:
        getEuclideanDistance
        getGeodesicDistance
        getRobinsonFouldsDistance
        getWeightedRobinsonFouldsDistance

    :param fn: distance function handed on to `_generic_distance_calc`
    :param trees: list or tuple (must support `len`) of Tree objects
    :param normalise: boolean
    :param min_overlap: int
    :return: result of `scipy.spatial.distance.squareform` on the
        condensed list of pairwise distances
    """
    pairs = itertools.combinations(trees, 2)

    # Number of unordered pairs, n * (n - 1) / 2, as the progress total
    n_trees = len(trees)
    pbar = setup_progressbar('Calculating tree distances',
                             0.5 * n_trees * (n_trees - 1))
    pbar.start()

    condensed = []
    for step, (first, second) in enumerate(pairs):
        distance = _generic_distance_calc(fn, first, second, normalise,
                                          min_overlap)
        condensed.append(distance)
        pbar.update(step)
    pbar.finish()

    return scipy.spatial.distance.squareform(condensed)
def calc_trees(self, model=None, threads=1, indices=None, batchsize=1, output_dir=None, background=False):
    """
    Use pllpy to calculate maximum-likelihood trees

    :param model: model name written into each partition string. If None,
        it is chosen per record: 'DNA' when `rec.is_dna()` is true,
        otherwise 'LGX'. The value 'AUTOX' is replaced by 'AUTO'.
    :param threads: passed on to `tasks.pll_task`
    :param indices: record indices to process; all records if None
    :param batchsize: passed on to `parallel_map`
    :param output_dir: if it names an existing directory, each task is
        additionally given an output path `<output_dir>/<rec.name>.json`
    :param background: passed on to `parallel_map`; when true in the
        parallel case the map result is returned unprocessed
    :return: None (all side effects), or the `parallel_map` result when
        run in background
    """
    # Materialise the indices: they are walked twice (building the
    # arguments, then pairing with results), so a one-shot iterator
    # would otherwise be exhausted before the results are processed.
    if indices is None:
        indices = list(range(len(self)))
    else:
        indices = list(indices)

    # Loop-invariant: whether per-record output files are requested
    write_output = output_dir is not None and os.path.isdir(output_dir)

    # Assemble argument lists
    args = []
    to_delete = []
    for i in indices:
        rec = self[i]
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)

        # Resolve the model per record WITHOUT overwriting `model`;
        # previously the first record's data type fixed the model for
        # every later record when `model` was None.
        if model is None:
            rec_model = 'DNA' if rec.is_dna() else 'LGX'
        else:
            rec_model = model
        if rec_model == 'AUTOX':
            rec_model = 'AUTO'

        partition = '{}, {} = 1 - {}'.format(rec_model, rec.name, len(rec))
        # Use the stored NJ tree if there is one, otherwise pass True
        tree = rec.parameters.nj_tree if rec.parameters.nj_tree is not None else True

        if write_output:
            output_file = os.path.join(output_dir, '{}.json'.format(rec.name))
            curr_args = (filename, partition, tree, threads, PLL_RANDOM_SEED, None, output_file)
        else:
            curr_args = (filename, partition, tree, threads, PLL_RANDOM_SEED)
        args.append(curr_args)

    # Dispatch work
    msg = 'Calculating ML trees'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.pll_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.pll_task, args, msg, batchsize, background)
        # NOTE(review): early return assumed to belong to the parallel
        # branch only; temporary files are not cleaned up on this path.
        if background:
            return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for done, (i, result) in enumerate(zip(indices, map_result), 1):
            self[i].parameters.construct_from_dict(result)
            pbar.update(done)
        # Close the progress bar, as the sibling methods do
        pbar.finish()
def add_lnl_partitions(self, partitions, threads=1, use_calculated_freqs=True, batchsize=1, background=False):
    """
    Compute likelihood results for every cluster of `partitions` that is
    not yet a key of `self.lnl_cache`, and store them in that cache.

    `add_minsq_partitions` is run first, because each task is started
    from the tree stored in `self.minsq_cache`.

    :param partitions: a single Partition, or an iterable of Partitions
    :param threads: passed on to `tasks.pll_task`
    :param use_calculated_freqs: if true, `conc.frequencies` is passed to
        the task; otherwise None is passed in its place
    :param batchsize: passed on to `parallel_map`
    :param background: passed on to `parallel_map`; when true in the
        parallel case the map result is returned and nothing is cached
    :return: None, or the `parallel_map` result when run in background
    """
    self.add_minsq_partitions(partitions)
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    requested = set(ix for partition in partitions
                    for ix in partition.get_membership())
    pending = requested.difference(self.lnl_cache.keys())
    if not pending:
        return

    # Collect argument list
    args = []
    to_delete = []
    for ix in pending:
        conc = self.concatenate(ix)
        filename, delete = conc.alignment.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        partition = conc.qfile(dna_model="GTR", protein_model="LG", ml_freqs=True)
        tree = self.minsq_cache[ix]['tree']
        freqs = conc.frequencies if use_calculated_freqs else None
        args.append((filename, partition, tree, threads, PLL_RANDOM_SEED, freqs))

    # Distribute work
    msg = 'Adding ML cluster trees'
    with fileIO.TempFileList(to_delete):
        client = get_client()
        if client is None:
            map_result = sequential_map(tasks.pll_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.pll_task, args, msg,
                                      batchsize, background)
            # NOTE(review): early return assumed to belong to the
            # parallel branch; it leaves the TempFileList block while
            # background work may still be pending -- confirm intended.
            if background:
                return map_result

    # Process results; `pending` iterates in the order used for `args`
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for count, (ix, result) in enumerate(zip(pending, map_result)):
        self.lnl_cache[ix] = result
        pbar.update(count)
    pbar.finish()
def calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and
    variances using ML (requires ML models to have been set up using
    `calc_trees`).

    For every record the stored `alpha` and `frequencies` (plus `rates`
    for DNA records) are sent to `tasks.calc_distances_task`; the
    returned distances, variances and NJ tree are written back onto the
    record's parameters.

    :param batchsize: passed on to `parallel_map`
    :param background: passed on to `parallel_map`; when true in the
        parallel case the map result is returned unprocessed
    :return: None (all side effects), or the `parallel_map` result when
        run in background
    """
    # Assemble argument lists
    args = []
    to_delete = []
    for rec in self:
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)

        # Get input dict
        fitted = rec.parameters.partitions
        data = {'alpha': fitted.alpha, 'frequencies': fitted.frequencies}
        if rec.is_dna():
            data['rates'] = fitted.rates
        model = {'partitions': {0: data}}
        args.append((model, filename))

    # Dispatch
    msg = 'Calculating ML distances'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.calc_distances_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.calc_distances_task, args, msg, batchsize, background)
        # NOTE(review): early return assumed to belong to the parallel
        # branch only; temporary files are not cleaned up on this path.
        if background:
            return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for i, result in enumerate(map_result):
            rec = self[i]
            computed = result['partitions'][0]
            rec.parameters.partitions.distances = computed['distances']
            rec.parameters.partitions.variances = computed['variances']
            rec.parameters.nj_tree = result['nj_tree']
            # 1-based count of processed records (was a manual counter)
            pbar.update(i + 1)
        # Close the progress bar, as the sibling methods do
        pbar.finish()
def calc_trees(self, model=None, threads=1, indices=None, batchsize=1, output_dir=None, background=False):
    """
    Use pllpy to calculate maximum-likelihood trees

    :param model: model name used in each record's partition string.
        When None, the model is picked record by record: 'DNA' if
        `rec.is_dna()` is true, else 'LGX'. 'AUTOX' is mapped to 'AUTO'.
    :param threads: forwarded to `tasks.pll_task`
    :param indices: indices of the records to process (default: all)
    :param batchsize: forwarded to `parallel_map`
    :param output_dir: when this is an existing directory, each task also
        receives the output path `<output_dir>/<rec.name>.json`
    :param background: forwarded to `parallel_map`; if true in the
        parallel case, the map result is returned without processing
    :return: None (all side effects), or the `parallel_map` result when
        run in background
    """
    # The indices are traversed twice (arguments, then results), so copy
    # them into a list; a caller-supplied iterator would otherwise be
    # empty by the time results are matched to records.
    indices = list(range(len(self))) if indices is None else list(indices)

    # Decide once whether per-record output files are wanted
    use_output_dir = output_dir is not None and os.path.isdir(output_dir)

    # Assemble argument lists
    args = []
    to_delete = []
    for i in indices:
        rec = self[i]
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)

        # Per-record model. The caller's `model` must stay untouched:
        # assigning to it here made the first record's data type decide
        # the model of all following records when `model` was None.
        chosen = model
        if chosen is None:
            chosen = 'DNA' if rec.is_dna() else 'LGX'
        if chosen == 'AUTOX':
            chosen = 'AUTO'

        partition = '{}, {} = 1 - {}'.format(chosen, rec.name, len(rec))
        # Stored NJ tree when available, otherwise the flag True
        nj_tree = rec.parameters.nj_tree
        tree = nj_tree if nj_tree is not None else True

        curr_args = (filename, partition, tree, threads, PLL_RANDOM_SEED)
        if use_output_dir:
            output_file = os.path.join(output_dir, '{}.json'.format(rec.name))
            curr_args = curr_args + (None, output_file)
        args.append(curr_args)

    # Dispatch work
    msg = 'Calculating ML trees'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.pll_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.pll_task, args, msg, batchsize, background)
        # NOTE(review): early return assumed to belong to the parallel
        # branch only; temporary files are not cleaned up on this path.
        if background:
            return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for done, (i, result) in enumerate(zip(indices, map_result), 1):
            rec = self[i]
            rec.parameters.construct_from_dict(result)
            pbar.update(done)
        # The bar was previously left unfinished
        pbar.finish()
def fast_calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and
    variances using Jukes-Cantor closed formulae.

    Each record's alignment file is handed to
    `tasks.fast_calc_distances_task`; the returned tree, distances and
    variances are stored on the record's parameters.

    :param batchsize: passed on to `parallel_map`
    :param background: passed on to `parallel_map`; when true in the
        parallel case the map result is returned unprocessed
    :return: None (all side effects), or the `parallel_map` result when
        run in background
    """
    # Assemble argument lists: a 1-tuple holding the file name per record
    args = []
    to_delete = []
    for rec in self:
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        args.append((filename,))

    # Dispatch work (either sequentially or in parallel)
    msg = 'Calculating fast distances'
    with fileIO.TempFileList(to_delete):
        client = get_client()
        if client is None:
            map_result = sequential_map(tasks.fast_calc_distances_task,
                                        args, msg)
        else:
            map_result = parallel_map(client, tasks.fast_calc_distances_task,
                                      args, msg, batchsize, background)
            # NOTE(review): early return assumed to belong to the
            # parallel branch; it leaves the TempFileList block while
            # background work may still be pending -- confirm intended.
            if background:
                return map_result

    # Process results
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for i, result in enumerate(map_result):
        rec = self[i]
        distances = result['distances']
        variances = result['variances']
        rec.parameters.nj_tree = result['tree']

        params = rec.parameters.partitions
        if params is None:
            # No partition parameters yet: create one and store it
            # wrapped in a list
            params = PartitionParameters()
            rec.parameters.partitions = [params]
        params.distances = distances
        params.variances = variances
        pbar.update(i)
    pbar.finish()
def fast_calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and
    variances using Jukes-Cantor closed formulae.

    Runs `tasks.fast_calc_distances_task` once per record and copies the
    'tree', 'distances' and 'variances' entries of each result onto the
    matching record.

    :param batchsize: forwarded to `parallel_map`
    :param background: forwarded to `parallel_map`; if true in the
        parallel case, the map result is returned without processing
    :return: None (all side effects), or the `parallel_map` result when
        run in background
    """
    # Assemble argument lists
    to_delete = []
    args = []
    for record in self:
        path, is_temporary = record.get_alignment_file(as_phylip=True)
        if is_temporary:
            to_delete.append(path)
        args.append((path,))

    # Dispatch work (either sequentially or in parallel)
    msg = 'Calculating fast distances'
    with fileIO.TempFileList(to_delete):
        client = get_client()
        if client is not None:
            map_result = parallel_map(client, tasks.fast_calc_distances_task,
                                      args, msg, batchsize, background)
            # NOTE(review): early return assumed to apply to the
            # parallel branch only; it exits the TempFileList block
            # before background work has necessarily finished -- verify.
            if background:
                return map_result
        else:
            map_result = sequential_map(tasks.fast_calc_distances_task,
                                        args, msg)

    # Process results
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for position, outcome in enumerate(map_result):
        record = self[position]
        distances = outcome['distances']
        variances = outcome['variances']
        tree = outcome['tree']
        record.parameters.nj_tree = tree

        params = record.parameters.partitions
        if params is None:
            # First visit: create the parameters object and store it
            # wrapped in a list
            params = PartitionParameters()
            record.parameters.partitions = [params]
        params.distances = distances
        params.variances = variances
        pbar.update(position)
    pbar.finish()
def add_lnl_partitions(self, partitions, threads=1, use_calculated_freqs=True, batchsize=1, background=False):
    """
    Fill `self.lnl_cache` with likelihood results for the clusters found
    in `partitions` that the cache does not already contain.

    The MinSq cache is populated first via `add_minsq_partitions`, since
    every task starts from the tree stored in `self.minsq_cache`.

    :param partitions: one Partition or an iterable of Partition objects
    :param threads: forwarded to `tasks.pll_task`
    :param use_calculated_freqs: pass `conc.frequencies` to the task when
        true, None otherwise
    :param batchsize: forwarded to `parallel_map`
    :param background: forwarded to `parallel_map`; if true in the
        parallel case the map result is returned without caching
    :return: None, or the `parallel_map` result in background mode
    """
    self.add_minsq_partitions(partitions)
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    all_clusters = {ix for partition in partitions
                    for ix in partition.get_membership()}
    todo = all_clusters.difference(self.lnl_cache.keys())
    if not todo:
        return

    # Collect argument list
    to_delete = []
    args = []
    for cluster in todo:
        conc = self.concatenate(cluster)
        alignment = conc.alignment
        path, is_temporary = alignment.get_alignment_file(as_phylip=True)
        if is_temporary:
            to_delete.append(path)
        partition = conc.qfile(dna_model="GTR", protein_model="LG", ml_freqs=True)
        start_tree = self.minsq_cache[cluster]['tree']
        if use_calculated_freqs:
            frequencies = conc.frequencies
        else:
            frequencies = None
        args.append((path, partition, start_tree, threads, PLL_RANDOM_SEED,
                     frequencies))

    # Distribute work
    with fileIO.TempFileList(to_delete):
        msg = 'Adding ML cluster trees'
        client = get_client()
        if client is not None:
            map_result = parallel_map(client, tasks.pll_task, args, msg,
                                      batchsize, background)
            # NOTE(review): early return assumed to apply to the
            # parallel branch only; it exits the TempFileList block
            # before background work has necessarily finished -- verify.
            if background:
                return map_result
        else:
            map_result = sequential_map(tasks.pll_task, args, msg)

    # Process results; `todo` iterates in the order used to build `args`
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for position, (cluster, result) in enumerate(zip(todo, map_result)):
        self.lnl_cache[cluster] = result
        pbar.update(position)
    pbar.finish()
def calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and
    variances using ML (requires ML models to have been set up using
    `calc_trees`).

    The task input for each record is a dict holding the record's stored
    `alpha` and `frequencies` (and `rates` for DNA records) under
    partition 0. The 'distances', 'variances' and 'nj_tree' entries of
    each result are copied back onto the record.

    :param batchsize: forwarded to `parallel_map`
    :param background: forwarded to `parallel_map`; if true in the
        parallel case, the map result is returned without processing
    :return: None (all side effects), or the `parallel_map` result when
        run in background
    """
    # Assemble argument lists
    args = []
    to_delete = []
    for rec in self:
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)

        # Get input dict
        model = {'partitions': {}}
        data = {'alpha': rec.parameters.partitions.alpha,
                'frequencies': rec.parameters.partitions.frequencies}
        if rec.is_dna():
            data['rates'] = rec.parameters.partitions.rates
        model['partitions'][0] = data
        args.append((model, filename))

    # Dispatch
    msg = 'Calculating ML distances'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.calc_distances_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.calc_distances_task, args, msg, batchsize, background)
        # NOTE(review): early return assumed to belong to the parallel
        # branch only; temporary files are not cleaned up on this path.
        if background:
            return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        # `done` is the 1-based number of records handled so far; it
        # replaces a separately maintained counter.
        for done, result in enumerate(map_result, 1):
            rec = self[done - 1]
            rec.parameters.partitions.distances = result['partitions'][0]['distances']
            rec.parameters.partitions.variances = result['partitions'][0]['variances']
            rec.parameters.nj_tree = result['nj_tree']
            pbar.update(done)
        # The bar was previously left unfinished
        pbar.finish()
def read_parameters(self, input_dir): """ Read a directory full of tree files, matching them up to the already loaded alignments """ pbar = setup_progressbar("Loading parameters", len(self.records)) pbar.start() for i, rec in enumerate(self.records): hook = os.path.join(input_dir, '{}.json*'.format(rec.name)) filename = glob.glob(hook) try: with fileIO.freader(filename[0]) as infile: d = json.load(infile, parse_int=True) rec.parameters.construct_from_dict(d) except IOError, IndexError: continue finally:
def read_alignments(self, input_dir, file_format, header_grep=None, compression=None):
    """
    Load every alignment file found in an input directory.

    Files are selected by extension: 'fa', 'fas' and 'fasta' for the
    'fasta' format, 'phy' for 'phylip'; with compression the compression
    suffix is appended to each extension. The sorted file list is stored
    in `self._input_files`.

    :param input_dir: directory to search
    :param file_format: 'fasta' or 'phylip' (anything else selects no
        extensions)
    :param header_grep: optional callable applied to every sequence
        header; the record is rebuilt from the renamed sequences
    :param compression: None, 'gz' or 'bz2'
    :return: list of Alignment records, each named after its file with
        extensions stripped
    """
    optioncheck(compression, [None, 'gz', 'bz2'])

    if file_format == 'fasta':
        extensions = ['fa', 'fas', 'fasta']
    elif file_format == 'phylip':
        extensions = ['phy']
    else:
        extensions = []
    if compression:
        extensions = ['.'.join((ext, compression)) for ext in extensions]

    files = fileIO.glob_by_extensions(input_dir, extensions)
    files.sort(key=SORT_KEY)
    self._input_files = files

    def _load(path):
        # Build with the third argument True; on RuntimeError retry with
        # False (meaning of the flag is defined by Alignment).
        try:
            return Alignment(path, file_format, True)
        except RuntimeError:
            return Alignment(path, file_format, False)

    records = []
    pbar = setup_progressbar("Loading files", len(files), simple_progress=True)
    pbar.start()
    for i, f in enumerate(files):
        if compression is None:
            record = _load(f)
        else:
            # Copy the compressed input line by line into a temporary
            # file and load the alignment from that copy
            with fileIO.TempFile() as tmpfile:
                with fileIO.freader(f, compression) as reader, fileIO.fwriter(tmpfile) as writer:
                    for line in reader:
                        writer.write(line)
                record = _load(tmpfile)

        if header_grep:
            try:
                datatype = 'dna' if record.is_dna() else 'protein'
                renamed = [(header_grep(x), y) for (x, y) in record.get_sequences()]
                record = Alignment(renamed, datatype)
            except TypeError:
                raise TypeError("Couldn't apply header_grep to header\n"
                                "alignment number={}, name={}\n"
                                "header_grep={}".format(i, fileIO.strip_extensions(f), header_grep))
            except RuntimeError:
                # Report which alignment failed, then propagate
                print('RuntimeError occurred processing alignment number={}, name={}'
                      .format(i, fileIO.strip_extensions(f)))
                raise

        record.name = fileIO.strip_extensions(f)
        records.append(record)
        pbar.update(i)
    pbar.finish()
    return records
def read_alignments(self, input_dir, file_format, header_grep=None, compression=None):
    """
    Read all alignment files of the given format from `input_dir`.

    Recognised extensions are 'fa', 'fas' and 'fasta' (format 'fasta')
    and 'phy' (format 'phylip'), each followed by the compression suffix
    when `compression` is set. The sorted list of matching files is kept
    in `self._input_files`.

    :param input_dir: directory containing the alignment files
    :param file_format: 'fasta' or 'phylip'; other values match no files
    :param header_grep: optional callable used to rewrite each sequence
        header before the record is rebuilt
    :param compression: None, 'gz' or 'bz2'
    :return: list of Alignment records whose `name` is the file name
        with extensions stripped
    """
    optioncheck(compression, [None, 'gz', 'bz2'])

    extensions = []
    if file_format == 'fasta':
        extensions = ['fa', 'fas', 'fasta']
    elif file_format == 'phylip':
        extensions = ['phy']
    if compression:
        extensions = ['.'.join([suffix, compression]) for suffix in extensions]

    files = fileIO.glob_by_extensions(input_dir, extensions)
    files.sort(key=SORT_KEY)
    self._input_files = files

    def _alignment_from(source):
        # First attempt passes True as the third argument; a RuntimeError
        # triggers a second attempt with False.
        try:
            loaded = Alignment(source, file_format, True)
        except RuntimeError:
            loaded = Alignment(source, file_format, False)
        return loaded

    records = []
    pbar = setup_progressbar("Loading files", len(files), simple_progress=True)
    pbar.start()
    for number, path in enumerate(files):
        if compression is not None:
            # Write the decompressed content to a temporary file, then
            # load from it while the temporary file still exists
            with fileIO.TempFile() as tmpfile:
                with fileIO.freader(path, compression) as reader, fileIO.fwriter(tmpfile) as writer:
                    for line in reader:
                        writer.write(line)
                record = _alignment_from(tmpfile)
        else:
            record = _alignment_from(path)

        if header_grep:
            try:
                datatype = 'dna' if record.is_dna() else 'protein'
                record = Alignment([(header_grep(header), sequence)
                                    for (header, sequence) in record.get_sequences()],
                                   datatype)
            except TypeError:
                raise TypeError("Couldn't apply header_grep to header\n"
                                "alignment number={}, name={}\n"
                                "header_grep={}".format(
                                    number, fileIO.strip_extensions(path), header_grep))
            except RuntimeError:
                # Say which alignment failed before re-raising
                print('RuntimeError occurred processing alignment number={}, name={}'
                      .format(number, fileIO.strip_extensions(path)))
                raise

        record.name = fileIO.strip_extensions(path)
        records.append(record)
        pbar.update(number)
    pbar.finish()
    return records