def __init__(self, seed, **kwargs): self.seed = seed defaults = { # PhyloFit "subst_model": "REV", "use_em": True, # PhyloP "method": "SPH", "mode": "CONACC", "aln_alphabet": ["A", "T", "C", "G", "N"], } self.phast = Phast() self.__dict__.update(defaults) self._routine_done = False
class mmEvolution(object): """ Compute Evolutionary features Args: aln_fname (str): Alignment filename. aln (str): Alignment it-self. aln_format (str): Alignment format. Currently supported is FASTA. aln_alphabet (list): List of nucleotides to consider in the aligned sequences (others get filtered). subst_model (str): PhyloFit substitution model (REV...). tree (str): Tree in the Newick format. fitting_tree (bool): Fitting or not the tree on the alignment. use_em (bool): Fitting or not the tree with Expectation-Maximization algorithm. motif_def (str): 'seed' or 'seed_extended' or 'site'. motif_upstream_extension (int): Upstream extension length. motif_downstream_extension (int): Downstream extension length. """ def __init__(self, seed, **kwargs): self.seed = seed defaults = { # PhyloFit "subst_model": "REV", "use_em": True, # PhyloP "method": "SPH", "mode": "CONACC", "aln_alphabet": ["A", "T", "C", "G", "N"], } self.phast = Phast() self.__dict__.update(defaults) self._routine_done = False def _eval_routine(self, setup, worker, **kwargs): # Parameters if "aln_fname" not in kwargs and "aln" not in kwargs: raise IOError("An alignment is required") if "tree" not in kwargs and "fitting_tree" not in kwargs: raise IOError("A tree is required") if "aln_format" not in kwargs and "aln_fname" in kwargs: aln_format = kwargs.get("aln_fname").split(".")[-1].upper() if aln_format == "FA": aln_format = "FASTA" if aln_format is None: raise ValueError("Alignment format undetected") # Load alignment if "aln_fname" in kwargs: seqs = utils.load_fasta(kwargs["aln_fname"]) else: seqs = utils.load_fasta(kwargs["aln"], as_string=True) seqs_cleaned = {} seqs_coords = {} for seq_name, seq in seqs.items(): seqs_cleaned[seq_name] = utils.clean_seq(seq, self.aln_alphabet) seqs_coords[seq_name] = get_coord_vec(seq, self.aln_alphabet) kwargs.update({"seqs": seqs}) args = setup(**kwargs) args.update({"aln_format": aln_format}) # Reset out = [] # Compute for its in range(len(self.seed.end_sites)): end_site = self.seed.end_sites[its] # Motif # start_motif and end_motif are sequence coordinates => 1-based start_motif, end_motif = seed.get_motif_coordinates( end_site, self.motif_def, self.seed.pairings[its], self.motif_upstream_extension, self.motif_downstream_extension, self.seed.min_target_length, ) motif = self.seed.target_seq[start_motif - 1 : end_motif].replace("U", "T") # Species with seed(s) species_with_seed = [] for seq_name, seq in seqs_cleaned.items(): if seq.find(motif) != -1: species_with_seed.append(seq_name) args.update( { "species_with_seed": species_with_seed, "start_motif": start_motif, "end_motif": end_motif, "seqs_coords": seqs_coords, } ) out.append(worker(**args)) return out def _eval_cons_bls(self, **kwargs): def setup(**kwargs): return { "subst_model": kwargs.get("subst_model", self.subst_model), "fitting_tree": kwargs.get("fitting_tree", True), "use_em": kwargs.get("use_em", self.use_em), "fitting_tree_done": False, } def worker(species_with_seed, fitting_tree_done, fitting_tree, subst_model, aln_format, use_em): if len(species_with_seed) > 1: # Fitting tree if necessary if fitting_tree_done is False: if fitting_tree: if kwargs.get("aln_fname"): fitted_tree = self.phast.phylofit( subst_model=subst_model, aln_fname=kwargs["aln_fname"], aln_format=aln_format, tree=kwargs["tree"], use_em=use_em, )["tree"] elif kwargs("aln"): fitted_tree = self.phast.phylofit( subst_model=subst_model, aln=kwargs["aln"], aln_format=aln_format, tree=kwargs["tree"], use_em=use_em, )["tree"] else: fitted_tree = kwargs["tree"] fitting_tree_done = True # Compute BLS dtree = dendropy.Tree.get_from_string(fitted_tree, schema="newick", preserve_underscores=True) dtree.retain_taxa_with_labels(species_with_seed) return sum([edge.length for edge in dtree.postorder_edge_iter()][:-1]) else: return 0.0 self.cons_blss = self._eval_routine(setup, worker, **kwargs) return self.cons_blss def _eval_selec_phylop(self, **kwargs): def setup(**kwargs): return { "method": kwargs.get("method", self.method), "mode": kwargs.get("mode", self.mode), "motif_upstream_extension": kwargs.get("motif_upstream_extension", 0), "motif_downstream_extension": kwargs.get("motif_downstream_extension", 0), "ref_species": kwargs["seq"].keys()[0], "mod_fname": kwargs.get("mod_fname", None), } def worker(species_with_seed, ref_species, start_motif, end_motif, seqs_coords, seqs, method, mode, mod_fname): pval = 1.0 if len(species_with_seed) > 1: # Extract alignment start_motif_in_aln = seqs_coords[ref_species][start_motif - 1] end_motif_in_aln = seqs_coords[ref_species][end_motif - 1] partial_seqs = collections.OrderedDict() for seq_name, seq in seqs.items(): if seq_name in species_with_seed: a = start_motif_in_aln - 1 b = end_motif_in_aln partial_seqs[seq_name] = seq[a:b] partial_seqs = remove_gap_column(partial_seqs) aln = "\n".join(["> %s\n%s" % (k, v) for k, v in partial_seqs.items()]) pval = self.phast.phylop(method=method, mode=mode, mod_fname=mod_fname, aln=aln, aln_format="FASTA") return pval self.selec_phylops = self._eval_routine(setup, worker, **kwargs) return self.selec_phylops def routine(self, **kwargs): try: self._eval_cons_bls(**kwargs) self._eval_selec_phylop(**kwargs) except IOError: self.cons_blss = [0 for _ in range(len(self.seed.end_sites))] self.selec_phylops = [0 for _ in range(len(self.seed.end_sites))] self._routine_done = True @property def cons_bls(self): try: return min(self.cons_blss) except AttributeError: return min(self._eval_cons_bls()) @property def selec_phylop(self): try: return min(self.selec_phylops) except AttributeError: return min(self._eval_selec_phylop())