def add_minsq_partitions(self, partitions, batchsize=1, background=False):
    """
    Compute and cache MinSq cluster trees for the clusters of `partitions`.

    Only memberships that are not already keys of `self.minsq_cache` are
    computed.

    :param partitions: a Partition, or an iterable of Partitions
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned as-is and nothing is written to the cache
    :return: None, or the map result when `background` is truthy
    """
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    # Memberships that are still missing from the cache
    requested = set(ix for partition in partitions for ix in partition.get_membership())
    index_tuples = requested.difference(self.minsq_cache.keys())
    if not index_tuples:
        return

    # Collect argument list
    args = [self.concatenate(ix).get_tree_collection_strings() for ix in index_tuples]

    # Distribute work
    msg = 'Adding MinSq cluster trees'
    client = get_client()
    if client is not None:
        map_result = parallel_map(client, tasks.minsq_task, args, msg, batchsize, background)
    else:
        map_result = sequential_map(tasks.minsq_task, args, msg)
    if background:
        return map_result

    # Process results.
    # NOTE(review): pairing relies on map_result being in the same order as
    # args -- confirm against sequential_map / parallel_map.
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for position, (ix, result) in enumerate(zip(index_tuples, map_result)):
        self.minsq_cache[ix] = result
        pbar.update(position)
    pbar.finish()
def add_minsq_partitions(self, partitions, batchsize=1, background=False):
    """
    Fill `self.minsq_cache` with MinSq cluster-tree results for every
    membership of `partitions` that is not cached yet.

    :param partitions: a Partition, or an iterable of Partitions
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned directly and the cache is not updated
    :return: None, or the map result when `background` is truthy
    """
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    # Work out which memberships have no cached result
    wanted = {ix for partition in partitions for ix in partition.get_membership()}
    index_tuples = wanted.difference(self.minsq_cache.keys())
    if not index_tuples:
        return

    # Collect argument list
    args = []
    for ix in index_tuples:
        args.append(self.concatenate(ix).get_tree_collection_strings())

    # Distribute work
    msg = 'Adding MinSq cluster trees'
    client = get_client()
    if client is not None:
        map_result = parallel_map(client, tasks.minsq_task, args, msg, batchsize, background)
    else:
        map_result = sequential_map(tasks.minsq_task, args, msg)
    if background:
        return map_result

    # Process results.
    # NOTE(review): assumes map_result preserves the order of args -- confirm.
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for step, pair in enumerate(zip(index_tuples, map_result)):
        ix, result = pair
        self.minsq_cache[ix] = result
        pbar.update(step)
    pbar.finish()
def _get_inter_tree_distances(metric, trees, normalise=False, batchsize=100, background=False):
    """
    Run the distance task for `metric` on every pair of trees.

    :param metric: one of 'euc', 'geo', 'rf' or 'wrf'; any other value
        raises KeyError from the task lookup
    :param trees: iterable of trees; every 2-combination is compared
    :param normalise: passed to the task as the third element of each
        argument tuple
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned without further processing
    :return: `squareform` applied to the list of pairwise results, or the
        raw map result when `background` is truthy
    """
    # Assemble argument lists: one tuple per unordered pair of trees
    args = []
    for t1, t2 in itertools.combinations(trees, 2):
        args.append((t1, t2, normalise))

    # Get task
    tasks_dict = {
        'euc': tasks.eucdist_task,
        'geo': tasks.geodist_task,
        'rf': tasks.rfdist_task,
        'wrf': tasks.wrfdist_task,
    }
    task = tasks_dict[metric]

    # Dispatch
    msg = 'Inter-tree distances ({})'.format(metric)
    client = get_client()
    if client is not None:
        map_result = parallel_map(client, task, args, msg, batchsize, background)
    else:
        map_result = sequential_map(task, args, msg)
    if background:
        return map_result

    return squareform(list(map_result))
def calc_trees(self, model=None, threads=1, indices=None, batchsize=1, output_dir=None, background=False):
    """
    Use pllpy to calculate maximum-likelihood trees

    :param model: model name written into each record's partition string.
        When None, 'DNA' or 'LGX' is chosen separately for every record
        according to `rec.is_dna()`. 'AUTOX' is replaced by 'AUTO'.
    :param threads: passed through to the pll task
    :param indices: indices of the records to process; defaults to every
        record in `self`
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param output_dir: when this names an existing directory, each task is
        additionally given `<output_dir>/<rec.name>.json` as its last argument
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no record is updated
    :return: None (all side effects), or the map result when `background`
        is truthy
    """
    # Assemble argument lists.
    # `indices` is walked twice (here and when pairing results), so take a
    # list copy in case the caller passed a one-shot iterator.
    if indices is None:
        indices = list(range(len(self)))
    else:
        indices = list(indices)

    # The directory check does not depend on the record, so do it once
    write_output = output_dir is not None and os.path.isdir(output_dir)

    args = []
    to_delete = []
    for i in indices:
        rec = self[i]
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)

        # Resolve the model per record. Writing the default back into `model`
        # would make the first record's choice apply to all later records.
        if model is None:
            rec_model = 'DNA' if rec.is_dna() else 'LGX'
        elif model == 'AUTOX':
            rec_model = 'AUTO'
        else:
            rec_model = model

        partition = '{}, {} = 1 - {}'.format(rec_model, rec.name, len(rec))
        # Fall back to True when no NJ tree is stored on the record
        tree = rec.parameters.nj_tree if rec.parameters.nj_tree is not None else True
        if write_output:
            output_file = os.path.join(output_dir, '{}.json'.format(rec.name))
            curr_args = (filename, partition, tree, threads, PLL_RANDOM_SEED, None, output_file)
        else:
            curr_args = (filename, partition, tree, threads, PLL_RANDOM_SEED)
        args.append(curr_args)

    # Dispatch work
    msg = 'Calculating ML trees'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.pll_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.pll_task, args, msg, batchsize, background)
    if background:
        return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for done, (i, result) in enumerate(zip(indices, map_result), start=1):
            rec = self[i]
            rec.parameters.construct_from_dict(result)
            pbar.update(done)
        pbar.finish()
def add_lnl_partitions(self, partitions, threads=1, use_calculated_freqs=True, batchsize=1, background=False):
    """
    Compute and cache ML cluster trees for the clusters of `partitions`.

    `add_minsq_partitions` is called first because each task is given the
    cached MinSq tree of its cluster. Only memberships that are not already
    keys of `self.lnl_cache` are computed.

    :param partitions: a Partition, or an iterable of Partitions
    :param threads: passed through to the pll task
    :param use_calculated_freqs: when truthy, `conc.frequencies` is passed
        as the last task argument; otherwise None is passed
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and the cache is not updated
    :return: None, or the map result when `background` is truthy
    """
    self.add_minsq_partitions(partitions)
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    # Memberships that are still missing from the cache
    requested = set(ix for partition in partitions for ix in partition.get_membership())
    index_tuples = requested.difference(self.lnl_cache.keys())
    if not index_tuples:
        return

    # Collect argument list
    args = []
    to_delete = []
    for ix in index_tuples:
        conc = self.concatenate(ix)
        filename, delete = conc.alignment.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        partition = conc.qfile(dna_model="GTR", protein_model="LG", ml_freqs=True)
        tree = self.minsq_cache[ix]['tree']
        freqs = conc.frequencies if use_calculated_freqs else None
        args.append((filename, partition, tree, threads, PLL_RANDOM_SEED, freqs))

    # Distribute work
    # NOTE(review): the background return happens inside this `with` block;
    # confirm TempFileList does not remove files that pending tasks still need.
    with fileIO.TempFileList(to_delete):
        msg = 'Adding ML cluster trees'
        client = get_client()
        if client is not None:
            map_result = parallel_map(client, tasks.pll_task, args, msg, batchsize, background)
        else:
            map_result = sequential_map(tasks.pll_task, args, msg)
        if background:
            return map_result

    # Process results
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for position, (ix, result) in enumerate(zip(index_tuples, map_result)):
        self.lnl_cache[ix] = result
        pbar.update(position)
    pbar.finish()
def calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and variances using
    ML (requires ML models to have been set up using `calc_trees`).

    For every record the task result supplies the partition's `distances`
    and `variances` and the record's `nj_tree`.

    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no record is updated
    :return: None (all side effects), or the map result when `background`
        is truthy
    """
    # Assemble argument lists
    args = []
    to_delete = []
    for rec in self:
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        # Get input dict
        model = {'partitions': {}}
        data = {
            'alpha': rec.parameters.partitions.alpha,
            'frequencies': rec.parameters.partitions.frequencies,
        }
        if rec.is_dna():
            data['rates'] = rec.parameters.partitions.rates
        model['partitions'][0] = data
        args.append((model, filename))

    # Dispatch
    msg = 'Calculating ML distances'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.calc_distances_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.calc_distances_task, args, msg, batchsize, background)
    if background:
        return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for i, result in enumerate(map_result):
            rec = self[i]
            partition_result = result['partitions'][0]
            rec.parameters.partitions.distances = partition_result['distances']
            rec.parameters.partitions.variances = partition_result['variances']
            rec.parameters.nj_tree = result['nj_tree']
            pbar.update(i + 1)
        # Close the progress bar, as the other result-processing loops do
        pbar.finish()
def calc_trees(self, model=None, threads=1, indices=None, batchsize=1, output_dir=None, background=False):
    """
    Use pllpy to calculate maximum-likelihood trees

    :param model: model name written into each record's partition string.
        When None, 'DNA' or 'LGX' is chosen separately for every record
        according to `rec.is_dna()`. 'AUTOX' is replaced by 'AUTO'.
    :param threads: passed through to the pll task
    :param indices: indices of the records to process; defaults to every
        record in `self`
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param output_dir: when this names an existing directory, each task is
        additionally given `<output_dir>/<rec.name>.json` as its last argument
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no record is updated
    :return: None (all side effects), or the map result when `background`
        is truthy
    """
    # Assemble argument lists.
    # `indices` is walked twice (here and when pairing results), so take a
    # list copy in case the caller passed a one-shot iterator.
    if indices is None:
        indices = list(range(len(self)))
    else:
        indices = list(indices)

    # The directory check does not depend on the record, so do it once
    write_output = output_dir is not None and os.path.isdir(output_dir)

    args = []
    to_delete = []
    for i in indices:
        rec = self[i]
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)

        # Resolve the model per record. Writing the default back into `model`
        # would make the first record's choice apply to all later records.
        if model is None:
            rec_model = 'DNA' if rec.is_dna() else 'LGX'
        elif model == 'AUTOX':
            rec_model = 'AUTO'
        else:
            rec_model = model

        partition = '{}, {} = 1 - {}'.format(rec_model, rec.name, len(rec))
        # Fall back to True when no NJ tree is stored on the record
        tree = rec.parameters.nj_tree if rec.parameters.nj_tree is not None else True
        if write_output:
            output_file = os.path.join(output_dir, '{}.json'.format(rec.name))
            curr_args = (filename, partition, tree, threads, PLL_RANDOM_SEED, None, output_file)
        else:
            curr_args = (filename, partition, tree, threads, PLL_RANDOM_SEED)
        args.append(curr_args)

    # Dispatch work
    msg = 'Calculating ML trees'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.pll_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.pll_task, args, msg, batchsize, background)
    if background:
        return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for done, (i, result) in enumerate(zip(indices, map_result), start=1):
            rec = self[i]
            rec.parameters.construct_from_dict(result)
            pbar.update(done)
        pbar.finish()
def fast_calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and variances using
    Jukes-Cantor closed formulae.

    For every record the task result supplies `distances`, `variances` and
    `tree`; the tree is stored as the record's `nj_tree`.

    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no record is updated
    :return: None (all side effects), or the map result when `background`
        is truthy
    """
    # Assemble argument lists
    args = []
    to_delete = []
    for rec in self:
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        args.append((filename,))

    # Dispatch work (either sequentially or in parallel)
    # NOTE(review): the background return happens inside this `with` block;
    # confirm TempFileList does not remove files that pending tasks still need.
    msg = 'Calculating fast distances'
    with fileIO.TempFileList(to_delete):
        client = get_client()
        if client is None:
            map_result = sequential_map(tasks.fast_calc_distances_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.fast_calc_distances_task, args, msg, batchsize, background)
        if background:
            return map_result

    # Process results
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for i, result in enumerate(map_result):
        rec = self[i]
        rec.parameters.nj_tree = result['tree']
        params = rec.parameters.partitions
        if params is None:
            # No partition parameters yet: create them and attach to the record
            params = PartitionParameters()
            rec.parameters.partitions = [params]
        params.distances = result['distances']
        params.variances = result['variances']
        pbar.update(i)
    pbar.finish()
def fast_calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and variances using
    Jukes-Cantor closed formulae.

    For every record the task result supplies `distances`, `variances` and
    `tree`; the tree is stored as the record's `nj_tree`.

    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no record is updated
    :return: None (all side effects), or the map result when `background`
        is truthy
    """
    # Assemble argument lists
    args = []
    to_delete = []
    for rec in self:
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        args.append((filename,))

    # Dispatch work (either sequentially or in parallel)
    # NOTE(review): the background return happens inside this `with` block;
    # confirm TempFileList does not remove files that pending tasks still need.
    msg = 'Calculating fast distances'
    with fileIO.TempFileList(to_delete):
        client = get_client()
        if client is None:
            map_result = sequential_map(tasks.fast_calc_distances_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.fast_calc_distances_task, args, msg, batchsize, background)
        if background:
            return map_result

    # Process results
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for i, result in enumerate(map_result):
        rec = self[i]
        rec.parameters.nj_tree = result['tree']
        params = rec.parameters.partitions
        if params is None:
            # No partition parameters yet: create them and attach to the record
            params = PartitionParameters()
            rec.parameters.partitions = [params]
        params.distances = result['distances']
        params.variances = result['variances']
        pbar.update(i)
    pbar.finish()
def add_lnl_partitions(self, partitions, threads=1, use_calculated_freqs=True, batchsize=1, background=False):
    """
    Fill `self.lnl_cache` with ML cluster-tree results for every membership
    of `partitions` that is not cached yet.

    `add_minsq_partitions` runs first; each task receives the cached MinSq
    tree of its cluster.

    :param partitions: a Partition, or an iterable of Partitions
    :param threads: passed through to the pll task
    :param use_calculated_freqs: when truthy, `conc.frequencies` is passed
        as the last task argument; otherwise None is passed
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and the cache is not updated
    :return: None, or the map result when `background` is truthy
    """
    self.add_minsq_partitions(partitions)
    if isinstance(partitions, Partition):
        partitions = (partitions,)

    # Work out which memberships have no cached result
    wanted = {ix for partition in partitions for ix in partition.get_membership()}
    index_tuples = wanted.difference(self.lnl_cache.keys())
    if not index_tuples:
        return

    # Collect argument list
    args = []
    to_delete = []
    for ix in index_tuples:
        conc = self.concatenate(ix)
        filename, delete = conc.alignment.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        partition = conc.qfile(dna_model="GTR", protein_model="LG", ml_freqs=True)
        tree = self.minsq_cache[ix]['tree']
        if use_calculated_freqs:
            freqs = conc.frequencies
        else:
            freqs = None
        args.append((filename, partition, tree, threads, PLL_RANDOM_SEED, freqs))

    # Distribute work
    # NOTE(review): the background return happens inside this `with` block;
    # confirm TempFileList does not remove files that pending tasks still need.
    with fileIO.TempFileList(to_delete):
        msg = 'Adding ML cluster trees'
        client = get_client()
        if client is not None:
            map_result = parallel_map(client, tasks.pll_task, args, msg, batchsize, background)
        else:
            map_result = sequential_map(tasks.pll_task, args, msg)
        if background:
            return map_result

    # Process results
    pbar = setup_progressbar('Processing results', len(map_result))
    pbar.start()
    for step, pair in enumerate(zip(index_tuples, map_result)):
        ix, result = pair
        self.lnl_cache[ix] = result
        pbar.update(step)
    pbar.finish()
def calc_distances(self, batchsize=1, background=False):
    """
    Calculate fast approximate intra-alignment pairwise distances and variances using
    ML (requires ML models to have been set up using `calc_trees`).

    For every record the task result supplies the partition's `distances`
    and `variances` and the record's `nj_tree`.

    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no record is updated
    :return: None (all side effects), or the map result when `background`
        is truthy
    """
    # Assemble argument lists
    args = []
    to_delete = []
    for rec in self:
        filename, delete = rec.get_alignment_file(as_phylip=True)
        if delete:
            to_delete.append(filename)
        # Get input dict
        model = {'partitions': {}}
        data = {
            'alpha': rec.parameters.partitions.alpha,
            'frequencies': rec.parameters.partitions.frequencies,
        }
        if rec.is_dna():
            data['rates'] = rec.parameters.partitions.rates
        model['partitions'][0] = data
        args.append((model, filename))

    # Dispatch
    msg = 'Calculating ML distances'
    client = get_client()
    if client is None:
        map_result = sequential_map(tasks.calc_distances_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.calc_distances_task, args, msg, batchsize, background)
    if background:
        return map_result

    # Process results
    with fileIO.TempFileList(to_delete):
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for i, result in enumerate(map_result):
            rec = self[i]
            partition_result = result['partitions'][0]
            rec.parameters.partitions.distances = partition_result['distances']
            rec.parameters.partitions.variances = partition_result['variances']
            rec.parameters.nj_tree = result['nj_tree']
            pbar.update(i + 1)
        # Close the progress bar, as the other result-processing loops do
        pbar.finish()
def simulate(self, partition, outdir, batchsize=1, background=False, **kwargs):
    """
    Simulate a set of alignments from the parameters inferred on a partition

    One simulated alignment per record of `self.collection` is written to
    `<outdir>/<record name>.phy` in phylip format.

    :param partition: Partition whose clusters supply the ML parameters
    :param outdir: directory the simulated alignments are written to
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no file is written. It is deliberately not
        passed on to `add_lnl_partitions`, whose cached results are read
        immediately afterwards.
    :param kwargs: forwarded to `add_lnl_partitions`
    :return: None, or the map result when `background` is truthy
    """
    indices = partition.get_membership()
    self.add_lnl_partitions(partition, **kwargs)
    results = [self.lnl_cache[ix] for ix in indices]
    # Record name -> position in the collection
    places = {rec.name: i for i, rec in enumerate(self.collection.records)}

    # Collect argument list
    # NOTE(review): a record that appears in none of the results keeps a None
    # entry here -- presumably the partition covers every record; confirm.
    args = [None] * len(self.collection)
    for result in results:
        # `part` rather than `partition`, so the parameter is not shadowed
        for part in result['partitions'].values():
            place = places[part['name']]
            args[place] = (len(self.collection[place]),
                           model_translate(part['model']),
                           part['frequencies'],
                           part['alpha'],
                           result['ml_tree'],
                           part['rates'] if 'rates' in part else None)

    # Distribute work
    msg = 'Simulating'
    client = get_client()
    if client is None:
        # sequential_map takes (task, args, msg); there is no client to pass
        map_result = sequential_map(tasks.simulate_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.simulate_task, args, msg, batchsize, background)
    if background:
        return map_result

    # Process results
    for i, result in enumerate(map_result):
        orig = self.collection[i]
        simseqs = gapmask(result, orig.get_sequences())
        al = Alignment(simseqs, 'protein' if orig.is_protein() else 'dna')
        outfile = os.path.join(outdir, orig.name + '.phy')
        al.write_alignment(outfile, 'phylip', True)
def _get_inter_tree_distances(metric, trees, normalise=False, batchsize=100, background=False):
    """
    Apply the distance task selected by `metric` to all pairs of trees.

    :param metric: one of 'euc', 'geo', 'rf' or 'wrf'; any other value
        raises KeyError from the task lookup
    :param trees: iterable of trees; every 2-combination is compared
    :param normalise: passed to the task as the third element of each
        argument tuple
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned without further processing
    :return: `squareform` applied to the list of pairwise results, or the
        raw map result when `background` is truthy
    """
    # Assemble argument lists: one tuple per unordered pair of trees
    pairs = itertools.combinations(trees, 2)
    args = [(first, second, normalise) for (first, second) in pairs]

    # Get task
    tasks_dict = {
        'euc': tasks.eucdist_task,
        'geo': tasks.geodist_task,
        'rf': tasks.rfdist_task,
        'wrf': tasks.wrfdist_task,
    }
    task = tasks_dict[metric]

    # Dispatch
    msg = 'Inter-tree distances ({})'.format(metric)
    client = get_client()
    if client is not None:
        map_result = parallel_map(client, task, args, msg, batchsize, background)
    else:
        map_result = sequential_map(task, args, msg)
    if background:
        return map_result

    pairwise = list(map_result)
    return squareform(pairwise)
def simulate(self, partition, outdir, batchsize=1, background=False, **kwargs):
    """
    Simulate a set of alignments from the parameters inferred on a partition

    One simulated alignment per record of `self.collection` is written to
    `<outdir>/<record name>.phy` in phylip format.

    :param partition: Partition whose clusters supply the ML parameters
    :param outdir: directory the simulated alignments are written to
    :param batchsize: forwarded to `parallel_map` when a client is available
    :param background: forwarded to `parallel_map`; if truthy, the map
        result is returned and no file is written. It is deliberately not
        passed on to `add_lnl_partitions`, whose cached results are read
        immediately afterwards.
    :param kwargs: forwarded to `add_lnl_partitions`
    :return: None, or the map result when `background` is truthy
    """
    indices = partition.get_membership()
    self.add_lnl_partitions(partition, **kwargs)
    results = [self.lnl_cache[ix] for ix in indices]
    # Record name -> position in the collection
    places = {rec.name: i for i, rec in enumerate(self.collection.records)}

    # Collect argument list
    # NOTE(review): a record that appears in none of the results keeps a None
    # entry here -- presumably the partition covers every record; confirm.
    args = [None] * len(self.collection)
    for result in results:
        # `part` rather than `partition`, so the parameter is not shadowed
        for part in result['partitions'].values():
            place = places[part['name']]
            args[place] = (len(self.collection[place]),
                           model_translate(part['model']),
                           part['frequencies'],
                           part['alpha'],
                           result['ml_tree'],
                           part['rates'] if 'rates' in part else None)

    # Distribute work
    msg = 'Simulating'
    client = get_client()
    if client is None:
        # sequential_map takes (task, args, msg); there is no client to pass
        map_result = sequential_map(tasks.simulate_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.simulate_task, args, msg, batchsize, background)
    if background:
        return map_result

    # Process results
    for i, result in enumerate(map_result):
        orig = self.collection[i]
        simseqs = gapmask(result, orig.get_sequences())
        al = Alignment(simseqs, 'protein' if orig.is_protein() else 'dna')
        outfile = os.path.join(outdir, orig.name + '.phy')
        al.write_alignment(outfile, 'phylip', True)