def initialized_on_problem(self, problem, role):
    """Return a version of this specification with its end windows defined.

    If the ``ends_locations`` attribute is missing or ``None``, a copy of the
    specification is returned whose ``ends_locations`` are two Locations of
    ``window_size`` nucleotides: one at the start and one at the end of the
    problem's sequence. Otherwise the specification itself is returned.

    ``role`` is accepted but not used here.
    """
    if getattr(self, 'ends_locations', None) is not None:
        return self
    sequence_length = len(problem.sequence)
    window = self.window_size
    left_end = Location(0, window)
    right_end = Location(sequence_length - window, sequence_length)
    return self.copy_with_changes(ends_locations=[left_end, right_end])
def evaluate(self, problem):
    """Score as (-total number of blast identities in matches).

    The evaluated region is ``self.location``, or the whole sequence when no
    location is set. Every BLAST hit whose span on the query is at least
    ``min_align_length`` long is reported as a Location. When no such hit
    exists the evaluation gets a score of 1.
    """
    location = self.location
    if location is None:
        location = Location(0, len(problem.sequence))
    query = location.extract_sequence(problem.sequence)
    blast_record = blast_sequence(
        query,
        blast_db=self.blast_db,
        subject_sequences=self.sequences,
        word_size=self.word_size,
        perc_identity=self.perc_identity,
        num_alignments=self.num_alignments,
        num_threads=self.num_threads,
        ungapped=self.ungapped,
        e_value=self.e_value,
        culling_limit=self.culling_limit,
    )

    # The BLAST helper may return a single record or a list of records.
    if isinstance(blast_record, list):
        alignments = []
        for record in blast_record:
            alignments.extend(record.alignments)
    else:
        alignments = blast_record.alignments

    # Convert each hit to (start, end, identities) in problem coordinates.
    # NOTE(review): the "- 1" presumably converts 1-based BLAST coordinates
    # to 0-based ones - confirm against the BLAST parser in use.
    offset = location.start
    matches = []
    for alignment in alignments:
        for hsp in alignment.hsps:
            low = min(hsp.query_start, hsp.query_end) + offset - 1
            high = max(hsp.query_start, hsp.query_end) + offset
            if (high - low) >= self.min_align_length:
                matches.append((low, high, hsp.identities))
    matches.sort()

    if not matches:
        return SpecEvaluation(self, problem, score=1,
                              message="Passed: no BLAST match found")

    score = -sum(identities for _, _, identities in matches)
    locations = [Location(low, high) for low, high, _ in matches]
    return SpecEvaluation(self, problem, score=score, locations=locations,
                          message="Failed - matches at %s" % locations)
def codon_index_to_location(self, index):
    """Return the 3-nucleotide Location of the codon at position ``index``.

    For a non-negative strand, codons are counted from ``self.location.start``
    and the result has strand 1. Otherwise codons are counted backwards from
    ``self.location.end`` and the result has strand -1.
    """
    span = self.location
    if span.strand >= 0:
        codon_start = span.start + 3 * index
        return Location(start=codon_start, end=codon_start + 3, strand=1)
    codon_end = span.end - 3 * index
    return Location(start=codon_end - 3, end=codon_end, strand=-1)
def evaluate(self, problem):
    """Return a score equal to -number_of modifications.

    Locations are "binned" modifications regions. Each bin has a length
    in nucleotides equal to ``localization_interval_length``.

    The positions where the current subsequence differs from
    ``self.target_sequence`` are mapped through ``self.indices`` when it is
    set, or else shifted by ``self.location`` when it is set. They are then
    grouped into Locations.
    """
    target = self.target_sequence
    sequence = self.extract_subsequence(problem.sequence)
    discrepancies = np.nonzero(
        sequences_differences_array(sequence, target))[0]

    if self.indices is not None:
        discrepancies = self.indices[discrepancies]
    elif self.location is not None:
        # NOTE(review): on the reverse strand the positions come out in
        # decreasing order - confirm group_nearby_indices accepts that.
        if self.location.strand == -1:
            discrepancies = self.location.end - discrepancies
        else:
            discrepancies = discrepancies + self.location.start

    groups = group_nearby_indices(
        discrepancies, max_group_spread=self.localization_interval_length)
    # The end bound is "+ 1" so that the last discrepancy of each group is
    # inside the Location (a lone discrepancy would otherwise give a
    # zero-length Location).
    locations = [Location(group[0], group[-1] + 1, 1) for group in groups]
    return SpecEvaluation(self, problem, score=-len(discrepancies),
                          locations=locations)
def global_evaluation(self, problem):
    """Find the k-mers of ``self.location`` that occur more than once.

    Every window of ``min_length`` nucleotides starting in
    ``self.extended_location`` is indexed by the k-mer returned by the
    problem's k-mer extractor. Windows whose k-mer was seen more than once
    and which lie strictly inside ``self.location`` are reported as
    Locations; the score is minus their number.
    """
    extract_kmer = self.get_kmer_extractor(problem.sequence)
    scan = self.extended_location
    occurrences = defaultdict(lambda: [])
    for i in range(scan.start, scan.end - self.min_length):
        occurrences[extract_kmer(i)].append((i, i + self.min_length))

    repeated = []
    for segments in occurrences.values():
        if len(segments) <= 1:
            continue
        for seg_start, seg_end in segments:
            # NOTE(review): the bounds are strict, so windows touching the
            # edges of self.location are not reported - confirm intended.
            if self.location.start < seg_start < seg_end < self.location.end:
                repeated.append(Location(seg_start, seg_end))
    repeated.sort(key=lambda loc: loc.start)

    if not repeated:
        return SpecEvaluation(
            self, problem, score=0,
            message="Passed: no nonunique %d-mer found." % self.min_length)
    return SpecEvaluation(
        self, problem, score=-len(repeated), locations=repeated,
        message="Failed, the following positions are the first occurences "
                "of non-unique segments %s" % repeated)
def initialize_on_problem(self, problem, role):
    """Return a specification that has a location.

    A specification that already has a location is returned unchanged.
    Otherwise a copy is returned whose location spans the whole problem
    sequence on strand 1. ``role`` is not used here.
    """
    if self.location is not None:
        return self
    spanning_copy = self.copy_with_changes()
    spanning_copy.location = Location(0, len(problem.sequence), 1)
    return spanning_copy
def codons_indices_to_locations(self, indices):
    """Convert a list of codon positions to a list of Locations.

    Positions are relative to ``self.location``. Nearby positions are merged
    with ``group_nearby_indices`` (spread ``localization_group_spread``),
    and each group becomes one Location extended by 3 nucleotides so that
    the last codon of the group is covered.
    """
    positions = np.array(indices)
    spread = self.localization_group_spread

    if self.location.strand == -1:
        # Reverse strand: positions are counted backwards from the end.
        positions = sorted(self.location.end - positions)
        groups = group_nearby_indices(positions, max_group_spread=spread)
        return [
            Location(group[0] - 3, group[-1], strand=-1) for group in groups
        ]

    positions += self.location.start
    groups = group_nearby_indices(positions, max_group_spread=spread)
    return [Location(group[0], group[-1] + 3) for group in groups]
def __init__(self, location=None, translation=None, boost=1.0):
    """Initialize.

    ``location`` may be ``None``, a tuple (converted with a default strand
    of +1) or a Location. A Location whose strand is neither -1 nor 1 is
    replaced by the same span on strand 1. The two ``initialize_*`` flags
    record whether the translation / location were left unspecified.
    """
    translation_missing = translation is None
    self.translation = translation

    if isinstance(location, tuple):
        location = Location.from_tuple(location, default_strand=+1)
    location_missing = location is None
    if not location_missing and location.strand not in (-1, 1):
        location = Location(location.start, location.end, 1)

    self.set_location(location)
    self.boost = boost
    self.initialize_translation_from_problem = translation_missing
    self.initialize_location_from_problem = location_missing
def initialize_on_problem(self, problem, role):
    """Find out what sequence it is that we are supposed to conserve.

    Returns the specification itself when both its location and its
    ``target_sequence`` are already set. Otherwise returns a copy in which
    a missing location is replaced by the full span of the problem sequence
    (strand 1) and a missing ``target_sequence`` is read from the problem's
    current sequence. ``role`` is not used here.
    """
    if self.location is None:
        location = Location(0, len(problem.sequence), 1)
        result = self.copy_with_changes(location=location)
    else:
        result = self

    if self.target_sequence is None:
        # Extract through ``result`` rather than ``self``: ``result`` is
        # guaranteed to carry a location, while ``self.location`` may still
        # be None at this point.
        target = result.extract_subsequence(problem.sequence)
        result = result.copy_with_changes()
        result.target_sequence = target
    return result
def initialize_on_problem(self, problem, role='constraint'):
    """Give the specification a location and validate its choices.

    When no location is set, a copy spanning the whole problem sequence
    (strand 1) is used. Every entry of ``choices`` must then have the same
    length as the location.

    Raises
    ------
    ValueError
      If one of the choices has a length different from the location's.
    """
    result = self
    if self.location is None:
        whole_sequence = Location(0, len(problem.sequence), 1)
        result = self.copy_with_changes(location=whole_sequence)

    for choice in result.choices:
        if len(choice) != len(result.location):
            raise ValueError("All sequence choices should have the same "
                             "length as the region on which the spec is "
                             "applied.")
    return result
def initialize_on_problem(self, problem, role):
    """Get translation from the sequence if it is not already set.

    A specification without location first gets a copy located on the whole
    problem sequence (strand 1). If the translation is still ``None``, it is
    computed by translating the located subsequence with
    ``self.codons_translations`` and stored in a further copy.
    """
    result = self
    if self.location is None:
        full_span = Location(0, len(problem.sequence), 1)
        result = self.copy_with_changes()
        result.set_location(full_span)

    if result.translation is not None:
        return result

    coding_sequence = result.location.extract_sequence(problem.sequence)
    protein = translate(coding_sequence, self.codons_translations)
    return result.copy_with_changes(translation=protein)
def __init__(self, max_energy=-5.0, location=None,
             optimize_initiator=False, boost=1.0):
    """Initialize.

    ``max_energy`` is stored as ``self.max_e``. ``location`` may be ``None``,
    a tuple (converted with ``Location.from_tuple``) or a Location; a
    location on strand -1 is replaced by the same span on strand 1.
    """
    if isinstance(location, tuple):
        location = Location.from_tuple(location)
    on_reverse_strand = (location is not None) and (location.strand == -1)
    if on_reverse_strand:
        location = Location(location.start, location.end, 1)

    self.max_e = max_energy
    self.boost = boost
    self.optimize_initiator = optimize_initiator
    self.location = location
def evaluate(self, problem):
    """Score is minus the number of wrong-translation codons.

    The evaluated region is ``self.location``, or the whole problem sequence
    when no location is set. The region is translated with
    ``self.codons_translations`` and compared, codon by codon, with
    ``self.translation``. Each mismatching codon is reported as a
    3-nucleotide Location in problem coordinates.
    """
    location = (self.location if self.location is not None
                else Location(0, len(problem.sequence)))
    subsequence = location.extract_sequence(problem.sequence)
    translation = translate(subsequence, self.codons_translations)
    errors = [
        ind for ind, amino_acid in enumerate(translation)
        if amino_acid != self.translation[ind]
    ]

    # Use the local ``location`` (never None) rather than ``self.location``,
    # and offset forward-strand codons by the location's start so that the
    # reported positions are relative to the problem sequence.
    if location.strand == -1:
        errors_locations = [
            Location(start=location.end - 3 * (ind + 1),
                     end=location.end - 3 * ind,
                     strand=-1)
            for ind in errors
        ]
    else:
        errors_locations = [
            Location(location.start + 3 * ind,
                     location.start + 3 * (ind + 1))
            for ind in errors
        ]

    success = (len(errors) == 0)
    return SpecEvaluation(self, problem, score=-len(errors),
                          locations=errors_locations,
                          message="All OK." if success else
                          "Wrong translation at indices %s" % errors)
def evaluate(self, problem):
    """Return a score based on the number of changed nucleotides.

    Positions where the current subsequence equals ``self.reference`` are
    the "equalities"; the number of differences is the number of considered
    positions minus the number of equalities.

    - If ``self.minimum`` is set, the score is
      ``n_differences - minimum`` and the single reported location is
      ``self.location``.
    - Otherwise the score is ``-abs(n_differences - amount)``. The reported
      locations are the binned equalities when there are too few (or just
      enough) differences, and the binned differences when there are too
      many. Each bin has a length in nucleotides governed by
      ``localization_interval_length``.
    """
    # FIND THE INDICES WHERE THE SEQUENCE IS UNCHANGED
    # Note: at this stage any minimum_percent or amount_percent have been
    # transformed into absolute self.minimum and self.amount.
    target = self.reference
    sequence = self.extract_subsequence(problem.sequence)
    equalities = np.nonzero(
        1 - sequences_differences_array(sequence, target))[0]

    if self.indices is not None:
        equalities = self.indices[equalities]
        n_indices = len(self.indices)
    else:
        if self.location is not None:
            if self.location.strand == -1:
                equalities = self.location.end - equalities
            else:
                equalities = equalities + self.location.start
        n_indices = len(self.location)
    n_differences = n_indices - len(equalities)

    # With a minimum, the whole location is reported: no binning needed.
    if self.minimum is not None:
        return SpecEvaluation(self, problem,
                              score=n_differences - self.minimum,
                              locations=[self.location])

    score = -abs(n_differences - self.amount)
    if n_differences <= self.amount:
        flagged = equalities
    else:
        # A set gives constant-time membership tests.
        # NOTE(review): candidates are taken from self.location.indices
        # even when self.indices is set - confirm this is intended.
        unchanged = set(equalities.tolist())
        flagged = [i for i in self.location.indices if i not in unchanged]

    groups = group_nearby_indices(
        flagged, max_group_spread=self.localization_interval_length)
    locations = [Location(group[0], group[-1] + 1, 1) for group in groups]
    return SpecEvaluation(self, problem, score=score, locations=locations)
def localized(self, location, problem=None, with_righthand=True):
    """Generic localization method for codon specifications.

    Returns the specification itself when it has no location, and ``None``
    when its location does not overlap ``location``. Otherwise the overlap
    is widened to whole codons (counted from the start of the location on
    the forward strand, from its end on strand -1), clipped to the
    specification's location, and passed to the class'
    ``.localized_on_window`` method together with the codon range.

    ``problem`` and ``with_righthand`` are accepted but not used here.
    """
    if self.location is None:
        return self
    overlap = self.location.overlap_region(location)
    if overlap is None:
        return None

    o_start, o_end = overlap.start, overlap.end
    w_start, w_end = self.location.start, self.location.end
    strand = self.location.strand

    if strand != -1:
        start_codon = int((o_start - w_start) / 3)
        end_codon = int((o_end - w_start - 1) / 3) + 1
        new_start = w_start + 3 * start_codon
        new_end = min(w_end, w_start + 3 * end_codon)
    else:
        start_codon = int((w_end - o_end) / 3)
        end_codon = int((w_end - o_start - 1) / 3) + 1
        new_start = max(w_start, w_end - 3 * end_codon)
        new_end = w_end - 3 * start_codon

    new_location = Location(start=new_start, end=new_end, strand=strand)
    return self.localized_on_window(new_location, start_codon, end_codon)
def insert_pattern_in_problem(self, problem, reverse=False):
    """Insert the pattern in the problem's sequence by successive tries.

    Each candidate start position in ``self.location`` is tried in turn
    (closest to the middle of the location first when ``self.center`` is
    set). For each one, the pattern is enforced at that position in a new
    problem; if that problem's mutation space is solvable, this
    specification passes on it, and its constraints can be resolved, the
    resulting sequence is written to ``problem.sequence`` and the method
    returns.

    If every position fails, the reverse-complement of the pattern is tried
    (unless already tried, or the pattern is palindromic).

    Raises
    ------
    NoSolutionError
      If no insertion position works.
    """
    insert = self.pattern.sequence
    if reverse:
        insert = reverse_complement(insert)
    size = self.pattern.size

    # NOTE(review): the last candidate start is ``end - size - 1``; a start
    # of ``end - size`` would still fit in the location - confirm intended.
    candidates = range(self.location.start, self.location.end - size)
    if self.center:
        middle = 0.5 * (self.location.start + self.location.end)
        candidates = sorted(candidates, key=lambda s: abs(s - middle))

    for position in candidates:
        site = Location(position, position + size, self.location.strand)
        enforce_site = EnforceSequence(sequence=insert, location=site)
        space = MutationSpace.from_optimization_problem(
            problem, new_constraints=[enforce_site]
        )
        if len(space.unsolvable_segments) > 0:
            continue

        trial_problem = DnaOptimizationProblem(
            sequence=space.constrain_sequence(problem.sequence),
            constraints=problem.constraints + [enforce_site],
            mutation_space=space,
            logger=None,
        )
        if not self.evaluate(trial_problem).passes:
            continue
        try:
            trial_problem.resolve_constraints()
            problem.sequence = trial_problem.sequence
        except NoSolutionError:
            # This position cannot be made to work; try the next one.
            continue
        return

    if reverse or self.pattern.is_palyndromic:
        raise NoSolutionError(
            problem=problem,
            location=self.location,
            message="Insertion of pattern %s in %s failed"
            % (self.pattern.sequence, self.location),
        )
    self.insert_pattern_in_problem(problem, reverse=True)
def sequence_edits_as_features(self, feature_type="misc_feature"):
    """Return a list of Biopython Record Features indicating each of the
    edits.

    One feature is created per segment where ``self.sequence`` differs from
    ``self.sequence_before``. Each feature is labelled ``"before=>after"``
    and colored red.

    Parameters
    ----------
    feature_type
      Type given to the created features. It is forwarded to
      ``Location.to_biopython_feature``.
      NOTE(review): assumes that method accepts a ``feature_type`` keyword
      rather than treating it as a qualifier - confirm.
    """
    segments = sequences_differences_segments(
        self.sequence, self.sequence_before
    )
    return [
        Location(start, end).to_biopython_feature(
            feature_type=feature_type,
            label="%s=>%s"
            % (self.sequence_before[start:end], self.sequence[start:end]),
            is_edit="true",
            ApEinfo_fwdcolor="#ff0000",
            color="#ff0000",
        )
        for start, end in segments
    ]
def evaluate(self, problem):
    """Return the score (-number_of_hairpins) and hairpins locations.

    For each position ``i`` of the evaluated subsequence, the stem
    ``sequence[i:i + stem_size]`` is searched in the reverse-complement of
    the rest of the window of size ``hairpin_window`` starting at ``i``.
    Hits are grouped with ``group_nearby_segments`` and reported as
    Locations in problem coordinates.
    """
    sequence = self.location.extract_sequence(problem.sequence)
    reverse = reverse_complement(sequence)
    window, stem = self.hairpin_window, self.stem_size

    segments = []
    for i in range(len(sequence) - window):
        word = sequence[i:i + stem]
        rest = reverse[-(i + window):-(i + stem)]
        # A single search gives both the presence and the position.
        position = rest.find(word)
        if position != -1:
            segments.append((i, i + position + len(word)))
    score = -len(segments)

    # Segments are relative to the extracted subsequence: shift them by the
    # location's start so they refer to the problem's sequence.
    # NOTE(review): a strand -1 location would need mirroring rather than a
    # plain shift - confirm such locations are normalized upstream.
    offset = self.location.start
    locations = sorted([
        Location(group[0][0] + offset, group[-1][1] + window + offset)
        for group in group_nearby_segments(segments, max_start_spread=10)
    ])
    return SpecEvaluation(self, problem, score, locations=locations)
def local_evaluation(self, problem):
    """Evaluate k-mer uniqueness using precomputed localization data.

    Only the k-mers starting at the "changing indices" stored in
    ``self.localization_data`` are re-extracted. An index is reported when
    its k-mer:

    - appears several times among the changing k-mers of the location, or
    - is a changing k-mer of the location that also appears among the
      changing k-mers of the extended region or among the fixed k-mers of
      either region, or
    - is a changing k-mer of the extended region that also appears among
      the changing or fixed k-mers of the location.

    The score is minus the number of reported indices (an index may be
    reported more than once).
    """
    extract_kmer = self.get_kmer_extractor(problem.sequence)

    # For each region, map every changing k-mer to its start indices.
    variable_kmers = {}
    for label in ("location", "extended"):
        variable_kmers[label] = indices_by_kmer = {}
        for i in self.localization_data[label]["changing_indices"]:
            indices_by_kmer.setdefault(extract_kmer(i), []).append(i)

    nonunique_locations = []
    for indices in variable_kmers["location"].values():
        if len(indices) > 1:
            nonunique_locations += indices

    location_variable_kmers = set(variable_kmers["location"].keys())
    extended_variable_kmers = set(variable_kmers["extended"].keys())
    fixed_location_kmers = self.localization_data["location"]["fixed_kmers"]
    extended_fixed_kmers = self.localization_data["extended"]["fixed_kmers"]

    for c in [extended_variable_kmers, fixed_location_kmers,
              extended_fixed_kmers]:
        nonunique_locations += [
            i
            for kmer in location_variable_kmers.intersection(c)
            for i in variable_kmers["location"][kmer]
        ]
    for c in [location_variable_kmers, fixed_location_kmers]:
        nonunique_locations += [
            i
            for kmer in extended_variable_kmers.intersection(c)
            for i in variable_kmers["extended"][kmer]
        ]

    nonunique_locations = [Location(i, i + self.min_length)
                           for i in nonunique_locations]
    # The two literals are joined with a space (they used to run together).
    return SpecEvaluation(
        self, problem, score=-len(nonunique_locations),
        locations=nonunique_locations,
        message="Failed, the following positions are the first occurences "
                "of local non-unique segments %s" % nonunique_locations)
def __init__(self, mini=0, maxi=1.0, target=None, window=None,
             location=None, boost=1.0):
    """Initialize.

    When ``target`` is given it overrides both ``mini`` and ``maxi``.
    ``location`` may be ``None``, a tuple (converted with
    ``Location.from_tuple``) or a Location; a location on strand -1 is
    replaced by the same span on strand 1.
    """
    if target is None:
        lower, upper = mini, maxi
    else:
        lower = upper = target

    if isinstance(location, tuple):
        location = Location.from_tuple(location)
    on_reverse_strand = (location is not None) and (location.strand == -1)
    if on_reverse_strand:
        location = Location(location.start, location.end, 1)

    self.target = target
    self.mini = lower
    self.maxi = upper
    self.window = window
    self.location = location
    self.boost = boost
def __init__(
    self,
    mini=0,
    maxi=1.0,
    target=None,
    window=None,
    location=None,
    boost=1.0,
):
    """Initialize.

    If ``mini`` is a string, it is parsed with ``self.string_to_parameters``
    and replaces ``mini``, ``maxi``, ``target`` and ``window``. When a
    target is defined it overrides both ``mini`` and ``maxi``. The location
    is normalized with ``Location.from_data``; a location on strand -1 is
    replaced by the same span on strand 1.
    """
    if isinstance(mini, str):
        mini, maxi, target, window = self.string_to_parameters(mini)
    if target is None:
        lower, upper = mini, maxi
    else:
        lower = upper = target

    self.target = target
    self.mini = lower
    self.maxi = upper
    self.window = window

    location = Location.from_data(location)
    on_reverse_strand = (location is not None) and (location.strand == -1)
    if on_reverse_strand:
        location = Location(location.start, location.end, 1)
    self.location = location
    self.boost = boost
def evaluate(self, problem):
    """Return the sum of breaches extent for all windowed breaches.

    The GC content of the located subsequence is computed (per window when
    ``self.window`` is set). The score is minus the total amount by which
    it falls below ``mini`` or exceeds ``maxi``. Breaching windows are
    reported as Locations; several breaches are merged with
    ``group_nearby_segments``.
    """
    region_start, region_end = self.location.start, self.location.end
    sequence = self.location.extract_sequence(problem.sequence)
    gc = gc_content(sequence, window_size=self.window)
    breaches = (np.maximum(0, self.mini - gc) +
                np.maximum(0, gc - self.maxi))
    score = -breaches.sum()
    breach_starts = region_start + (breaches > 0).nonzero()[0]

    n_breaches = len(breach_starts)
    if n_breaches == 0:
        spans = []
    elif n_breaches == 1:
        if self.window is None:
            # No windowing: the whole location is the breach.
            spans = [[region_start, region_end]]
        else:
            first = breach_starts[0]
            spans = [[first, first + self.window]]
    else:
        windows = [(start, start + self.window) for start in breach_starts]
        groups = group_nearby_segments(
            windows, max_start_spread=max(1, self.locations_span))
        spans = [(group[0][0], group[-1][-1]) for group in groups]

    if not spans:
        return SpecEvaluation(self, problem, score, locations=spans,
                              message="Passed !")

    locations = [Location(*span) for span in spans]
    message = ("Out of bound on segments " +
               ", ".join([str(location) for location in locations]))
    return SpecEvaluation(self, problem, score, locations=locations,
                          message=message)
def evaluate(self, problem):
    """Return a score equal to -number_of modifications.

    A position is a discrepancy when the nucleotide of the located
    subsequence is not among those allowed by ``IUPAC_NOTATION`` for the
    corresponding letter of ``self.sequence``. Locations are "binned"
    discrepancy regions, grouped with a spread of
    ``localization_interval_length``.
    """
    subsequence = self.location.extract_sequence(problem.sequence)
    mismatches = []
    for i, nucleotide in enumerate(subsequence):
        if nucleotide not in IUPAC_NOTATION[self.sequence[i]]:
            mismatches.append(i)
    discrepancies = np.array(mismatches)

    # Map positions of the subsequence to positions of the full sequence.
    if self.location.strand == -1:
        discrepancies = self.location.end - discrepancies
    else:
        discrepancies = discrepancies + self.location.start

    groups = group_nearby_indices(
        discrepancies, max_group_spread=self.localization_interval_length)
    locations = [Location(group[0], group[-1] + 1, 1) for group in groups]
    return SpecEvaluation(self, problem, score=-len(discrepancies),
                          locations=locations)
def initialize_on_problem(self, problem, role=None):
    """Return a specification that has a location.

    A specification that already has a location is returned unchanged.
    Otherwise a copy located on the whole problem sequence is returned.
    ``role`` is not used here.
    """
    if self.location is not None:
        return self
    whole_sequence = Location(0, len(problem.sequence))
    return self.copy_with_changes(location=whole_sequence)
def resolve_constraint(self, constraint):
    """Resolve a constraint through successive localizations.

    The constraint is first evaluated on the whole problem. If it does not
    pass, its breaching locations are visited in sorted order. For each
    one, increasingly large extensions of the location (from
    ``self.local_extensions``) are tried: a local problem is built around
    the extended location and resolved, and on success its sequence
    replaces this problem's sequence.

    Raises
    ------
    NoSolutionError
      If, for the last extension of a location, the region cannot be
      mutated or the local problem has no solution. The error carries the
      ``location`` and ``constraint`` involved.
    """
    # EVALUATE THE CONSTRAINT, FIND BREACHING LOCATIONS
    evaluation = constraint.evaluate(self)
    if evaluation.passes:
        return
    locations = sorted(evaluation.locations)
    n_locations = len(locations)

    def prepare_cold_exit(error, where):
        """Attach the failure context to ``error`` and log the exit."""
        error.location = where
        error.constraint = constraint
        error.message = "While solving %s in %s:\n\n%s" % (
            constraint,
            where,
            str(error),
        )
        self.logger(
            location__index=n_locations,
            location__message="Cold exit",
        )
        return error

    progress = self.logger.iter_bar(location=locations,
                                    bar_message=lambda loc: str(loc))

    # FOR EACH LOCATION, CREATE A LOCAL PROBLEM AND RESOLVE LOCALLY.
    for i, location in enumerate(progress):

        # SEVERAL "EXTENSIONS" OF THE LOCAL ZONE WILL BE TESTED IN TURN
        # IN CASE THE LOCAL SEQUENCE IS FROZEN DUE TO NUCLEOTIDE INTER-
        # DEPENDENCIES (CODONS, ETC.)
        for extension in self.local_extensions:
            is_last_extension = (extension == self.local_extensions[-1])
            new_location = location.extended(extension)
            mutation_space = self.mutation_space.localized(new_location)

            if mutation_space.space_size == 0:
                # The sequence is frozen at this location: go straight to
                # the next, larger extension, or fail if there is none.
                # The error data is used by the report generator.
                if not is_last_extension:
                    continue
                raise prepare_cold_exit(
                    NoSolutionError(
                        location=new_location,
                        problem=self,
                        message="Constraint breach in region that cannot "
                        "be mutated.",
                    ),
                    new_location,
                )
            new_location = Location(*mutation_space.choices_span)

            # Overlapping breaches can make the local optimization
            # impossible. If the next breach overlaps with the current
            # location, the constraint is localized with
            # with_righthand=False, which the constraint's ".localized"
            # method uses to decide which side to consider.
            has_next = i < (n_locations - 1)
            if has_next and locations[i + 1].overlap_region(new_location):
                this_local_constraint = constraint.localized(
                    new_location, with_righthand=False, problem=self)
            else:
                this_local_constraint = constraint.localized(
                    new_location, problem=self)
            evaluation = this_local_constraint.evaluate(self)

            # MAYBE THE LOCAL BREACH WAS ALREADY RESOLVED AS A SIDE EFFECT
            # OF SOLVING PREVIOUS BREACHES. IN THAT CASE, MOVE ON TO THE
            # NEXT EXTENSION.
            if evaluation.passes:
                continue

            # ELSE, CREATE A NEW LOCAL PROBLEM WITH LOCALIZED CONSTRAINTS
            this_local_constraint.is_focus = True
            this_local_constraint.evaluation = evaluation
            other_constraints = [
                cst.localized(new_location, problem=self)
                for cst in self.constraints
                if cst != constraint
                and not cst.enforced_by_nucleotide_restrictions
            ]
            # Only the other constraints that currently pass are kept.
            passing_constraints = [
                cst for cst in other_constraints
                if cst is not None and cst.evaluate(self).passes
            ]
            local_problem = self.__class__(
                sequence=self.sequence,
                constraints=([this_local_constraint] + passing_constraints),
                mutation_space=mutation_space,
            )
            local_problem.randomization_threshold = (
                self.randomization_threshold)
            local_problem.max_random_iters = self.max_random_iters
            local_problem.mutations_per_iteration = (
                self.mutations_per_iteration)

            # STORE THE LOCAL PROBLEM IN THE LOGGER.
            # This is useful for troubleshooting.
            self.logger.store(
                problem=self,
                local_problem=local_problem,
                location=location,
            )

            # RESOLVE THE LOCAL PROBLEM. RAISE AN ERROR IF IT FAILS FOR THE
            # LAST EXTENSION, ELSE TRY THE NEXT EXTENSION.
            try:
                if hasattr(constraint, "resolution_heuristic"):
                    constraint.resolution_heuristic(local_problem)
                else:
                    local_problem.resolve_constraints_locally()
                self._replace_sequence(local_problem.sequence)
            except NoSolutionError as error:
                if is_last_extension:
                    raise prepare_cold_exit(error, new_location)
                continue
            # Solved: no need to try larger extensions for this location.
            break
def optimize_objective(self, objective):
    """Optimize the total objective score, focusing on a single objective.

    The objective is evaluated to find the locations where it can be
    improved; nothing is done if its score already equals its best
    possible score. Each location is then optimized in turn: a local
    problem is created from the localized mutation space, constraints and
    objectives, and optimized with the objective's own
    ``optimization_heuristic`` if it has one, else by exhaustive search
    (mutation space smaller than ``randomization_threshold``) or by random
    mutations. The problem's sequence is updated after each location.
    """
    # EVALUATE OBJECTIVE. RETURN IF THERE IS NOTHING TO BE DONE.
    evaluation = objective.evaluate(self)
    locations = evaluation.locations
    best_score = objective.best_possible_score
    if (best_score is not None) and (evaluation.score == best_score):
        return

    # FOR EACH LOCATION, CREATE AND OPTIMIZE A LOCAL PROBLEM.
    progress = self.logger.iter_bar(location=locations,
                                    bar_message=lambda loc: str(loc))
    for location in progress:
        # Freeze every nucleotide outside of the location. Skip the
        # location if nothing in it can be mutated.
        mutation_space = self.mutation_space.localized(location)
        if mutation_space.space_size == 0:
            continue

        # Make the location match the span of the mutation space.
        location = Location(*mutation_space.choices_span)

        localized_constraints = []
        for constraint in self.constraints:
            local_constraint = constraint.localized(location, problem=self)
            if local_constraint is not None:
                localized_constraints.append(local_constraint)

        localized_objectives = []
        for other_objective in self.objectives:
            local_objective = other_objective.localized(location,
                                                        problem=self)
            if local_objective is not None:
                localized_objectives.append(local_objective)

        local_problem = self.__class__(
            sequence=self.sequence,
            constraints=localized_constraints,
            mutation_space=mutation_space,
            objectives=localized_objectives,
        )
        self.logger.store(problem=self, local_problem=local_problem,
                          location=location)

        # The local problem inherits this problem's search settings.
        local_problem.randomization_threshold = (
            self.randomization_threshold)
        local_problem.max_random_iters = self.max_random_iters
        local_problem.optimization_stagnation_tolerance = (
            self.optimization_stagnation_tolerance)
        local_problem.mutations_per_iteration = (
            self.mutations_per_iteration)

        # OPTIMIZE THE LOCAL PROBLEM
        if hasattr(objective, "optimization_heuristic"):
            # Some specifications implement their own optimization method.
            objective.optimization_heuristic(local_problem)
        elif (local_problem.mutation_space.space_size
              < self.randomization_threshold):
            local_problem.optimize_by_exhaustive_search()
        else:
            local_problem.optimize_by_random_mutations()

        # UPDATE THE PROBLEM's SEQUENCE
        self.sequence = local_problem.sequence
def location_or_default(location):
    """Return ``location``, or a full-sequence Location if it is ``None``.

    The default spans the whole sequence of ``problem`` (a name taken from
    the enclosing scope) on strand 1. It is only built when needed.
    """
    if location is not None:
        return location
    return Location(0, len(problem.sequence), 1)