Example #1
0
    def propose_sequences(
            self,
            measured_sequences: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Propose top `sequences_batch_size` sequences for evaluation.

        Random mutants of already-measured sequences are generated until
        `self.model_queries_per_batch` distinct, not-yet-measured candidates
        exist. The candidates are scored with `self.model.get_fitness` and
        a subset of them is returned.

        Args:
            measured_sequences: Frame with a "sequence" column holding the
                sequences measured so far.

        Returns:
            A `(sequences, predictions)` pair of arrays. When `self.elitist`
            is truthy these are the `self.sequences_batch_size` candidates
            with the highest predicted fitness, best first; otherwise they
            are `self.sequences_batch_size` candidates drawn uniformly at
            random (indices are drawn independently, so repeats can occur).
        """
        old_sequences = measured_sequences["sequence"]
        old_sequence_set = set(old_sequences)
        new_seqs = set()

        # Strict `<`: stop at exactly `model_queries_per_batch` candidates so
        # the single `get_fitness` call below stays within the query budget
        # (the previous `<=` produced one candidate too many).
        while len(new_seqs) < self.model_queries_per_batch:
            seq = self.rng.choice(old_sequences)
            new_seq = s_utils.generate_random_mutant(seq,
                                                     self.mu / len(seq),
                                                     alphabet=self.alphabet)

            # Only keep mutants that have not been measured already; the set
            # itself takes care of duplicates among the new candidates.
            if new_seq not in old_sequence_set:
                new_seqs.add(new_seq)

        new_seqs = np.array(list(new_seqs))
        preds = self.model.get_fitness(new_seqs)

        if self.elitist:
            # Reverse the ascending argsort, then take the first k entries.
            # The former `[:-k:-1]` slice yielded only k - 1 entries (and
            # none at all for k == 1).
            idxs = np.argsort(preds)[::-1][:self.sequences_batch_size]
        else:
            idxs = self.rng.integers(0,
                                     len(new_seqs),
                                     size=self.sequences_batch_size)

        return new_seqs[idxs], preds[idxs]
Example #2
0
    def propose_sequences(
            self,
            measured_sequences: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Propose top `sequences_batch_size` sequences for evaluation.

        A population is seeded from `measured_sequences` via
        `self._choose_parents`. Each generation, parents are picked from the
        population, mutated, scored with `self.model.get_fitness`, and the
        lowest-scoring population members are overwritten by the new
        children. Generations continue while the model cost spent in this
        call plus `self.population_size` is below
        `self.model_queries_per_batch`.

        Args:
            measured_sequences: Frame with "sequence" and "true_score"
                columns for everything measured so far.

        Returns:
            A `(sequences, predictions)` pair of arrays holding the
            `self.sequences_batch_size` newly generated sequences with the
            highest predicted fitness, best first (fewer if fewer were
            generated).
        """
        # Set the torch seed by generating a random integer from the pre-seeded self.rng
        torch.manual_seed(self.rng.integers(-(2**31), 2**31))

        measured_sequence_set = set(measured_sequences["sequence"])
        measured_scores = measured_sequences["true_score"].to_numpy()

        # Create initial population by choosing parents from `measured_sequences`
        initial_pop_inds = self._choose_parents(measured_scores,
                                                self.population_size)
        pop = measured_sequences["sequence"].to_numpy()[initial_pop_inds]
        scores = measured_scores[initial_pop_inds]

        # Number of parents drawn per generation; it does not change between
        # generations, so compute it once.
        num_children = int(self.children_proportion * self.population_size)

        sequences = {}
        initial_cost = self.model.cost
        while (self.model.cost - initial_cost + self.population_size <
               self.model_queries_per_batch):
            # Create "children" from parents selected from the population
            parents = pop[self._choose_parents(scores, num_children)]

            # Single-point mutation of children (for now). Children that were
            # already measured or already proposed in this call are dropped.
            children = []
            for seq in parents:
                child = s_utils.generate_random_mutant(seq, 1 / len(seq),
                                                       self.alphabet)

                if child not in measured_sequence_set and child not in sequences:
                    children.append(child)

            # NOTE(review): no model query happens on this path, so the loop
            # condition does not advance; repeated empty generations would
            # spin here — confirm this cannot happen in practice.
            if len(children) == 0:
                continue

            children = np.array(children)
            child_scores = self.model.get_fitness(children)

            # Now kick out the worst samples and replace them with the new children
            worst_first = np.argsort(scores)[:len(children)]
            pop[worst_first] = children
            scores[worst_first] = child_scores

            sequences.update(zip(children, child_scores))

        # We propose the top `self.sequences_batch_size`
        # new sequences we have generated
        new_seqs = np.array(list(sequences.keys()))
        preds = np.array(list(sequences.values()))
        # Reverse the ascending argsort and keep the first k entries; the
        # former `[:-k:-1]` slice returned only k - 1 of them.
        sorted_order = np.argsort(preds)[::-1][:self.sequences_batch_size]

        return new_seqs[sorted_order], preds[sorted_order]
Example #3
0
    def _extend_samples(self, samples, weights, min_samples=100):
        """Pad a sample pool with random mutants until it is large enough.

        While fewer than `min_samples` distinct sequences are present, a
        member of the (growing) pool is picked with `random.choice`, mutated
        with `s_utils.generate_random_mutant` at `self.mutation_rate`, and
        appended with weight 1 if it is not already in the pool.

        Args:
            samples: Iterable of sequences to start from.
            weights: Iterable of weights, one per entry of `samples`.
            min_samples: Minimum number of distinct sequences required
                before padding stops. Defaults to 100, the value that was
                previously hard-coded.

        Returns:
            A `(samples, weights)` pair of NumPy arrays: the original
            entries in their original order, followed by any added mutants.

        Raises:
            IndexError: If `samples` is empty and padding is required
                (raised by `random.choice`).
        """
        samples = list(samples)
        weights = list(weights)
        # Set used only for fast membership / distinct-count checks.
        sequences = set(samples)

        while len(sequences) < min_samples:
            parent = random.choice(samples)
            mutant = s_utils.generate_random_mutant(parent,
                                                    self.mutation_rate,
                                                    alphabet=self.alphabet)

            if mutant not in sequences:
                samples.append(mutant)
                weights.append(1)
                sequences.add(mutant)

        return np.array(samples), np.array(weights)
Example #4
0
    def propose_sequences(
            self,
            measured_sequences: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Propose top `sequences_batch_size` sequences for evaluation.

        Parents are the measured sequences whose "true_score" lies within
        `self.threshold` (relative) of the best measured score, tiled with
        `np.resize` to `self.sequences_batch_size` entries. Until the model
        cost spent in this call reaches `self.model_queries_per_batch`, the
        parents are recombined `self.rho` times and a mutation rollout is
        run from each parent, in chunks of `self.eval_batch_size`.

        Args:
            measured_sequences: Frame with "sequence" and "true_score"
                columns for everything measured so far.

        Returns:
            A `(sequences, predictions)` pair of arrays holding the
            `self.sequences_batch_size` newly generated sequences with the
            highest predicted fitness, best first (fewer if fewer were
            generated).
        """
        measured_sequence_set = set(measured_sequences["sequence"])

        # Keep every sequence whose score is within `threshold` of the best;
        # multiplying by the sign keeps the cut-off below the maximum even
        # when the maximum is negative.
        top_fitness = measured_sequences["true_score"].max()
        top_inds = measured_sequences["true_score"] >= top_fitness * (
            1 - np.sign(top_fitness) * self.threshold)

        parents = np.resize(
            measured_sequences["sequence"][top_inds].to_numpy(),
            self.sequences_batch_size,
        )

        sequences = {}
        previous_model_cost = self.model.cost
        while self.model.cost - previous_model_cost < self.model_queries_per_batch:
            # generate recombinant mutants
            for _ in range(self.rho):
                parents = self._recombine_population(parents)

            for start in range(0, len(parents), self.eval_batch_size):
                # Here we do rollouts from each parent (root of rollout tree)
                roots = parents[start:start + self.eval_batch_size]
                root_fitnesses = self.model.get_fitness(roots)

                # Each node carries the index of its root so its fitness can
                # be compared against that root later on.
                nodes = list(enumerate(roots))

                while (nodes
                       and self.model.cost - previous_model_cost +
                       self.eval_batch_size < self.model_queries_per_batch):
                    child_idxs = []
                    children = []
                    # One child per live node, visited in list order (the
                    # earlier `nodes[len(children) - 1]` indexing started at
                    # the last node and then wrapped around).
                    for idx, node in nodes:
                        # Keep mutating until we generate a child that has
                        # never been seen before
                        while True:
                            child = s_utils.generate_random_mutant(
                                node,
                                self.mu * 1 / len(node),
                                self.alphabet,
                            )
                            if (child not in measured_sequence_set
                                    and child not in sequences):
                                break

                        child_idxs.append(idx)
                        children.append(child)

                    # Stop the rollout once the child has worse predicted
                    # fitness than the root of the rollout tree.
                    # Otherwise, set node = child and add child to the list
                    # of sequences to propose.
                    fitnesses = self.model.get_fitness(children)
                    sequences.update(zip(children, fitnesses))

                    nodes = [(idx, child)
                             for idx, child, fitness in zip(
                                 child_idxs, children, fitnesses)
                             if fitness >= root_fitnesses[idx]]

        # We propose the top `self.sequences_batch_size` new sequences we have generated
        new_seqs = np.array(list(sequences.keys()))
        preds = np.array(list(sequences.values()))
        # Reverse the ascending argsort and keep the first k entries; the
        # former `[:-k:-1]` slice returned only k - 1 of them.
        sorted_order = np.argsort(preds)[::-1][:self.sequences_batch_size]

        return new_seqs[sorted_order], preds[sorted_order]
Example #5
0
    def propose_sequences(
        self, measured_sequences_data: pd.DataFrame
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Propose top `sequences_batch_size` sequences for evaluation.

        On the first round (maximum "round" value is 0) random mutants of
        `self.starting_sequence` are proposed. On later rounds the generator
        is trained on last round's sequences at or above the `self.Q`
        quantile of "true_score", then repeatedly sampled, scored with
        `self.model.get_fitness`, re-weighted, and retrained until the model
        cost spent in this call reaches `self.model_queries_per_batch`.

        Args:
            measured_sequences_data: Frame with "sequence", "true_score" and
                "round" columns for everything measured so far.

        Returns:
            A `(sequences, predictions)` pair of arrays. On the first round
            these are `self.sequences_batch_size` random mutants and their
            predictions; afterwards they are the `self.sequences_batch_size`
            generated sequences with the highest predicted fitness, best
            first (fewer if fewer were generated).
        """
        # If we are on the first round, our model has no data yet, so the
        # best policy is to propose random sequences in a small neighborhood.
        last_round = measured_sequences_data["round"].max()
        if last_round == 0:
            sequences = set()
            while len(sequences) < self.sequences_batch_size:
                sequences.add(
                    s_utils.generate_random_mutant(
                        self.starting_sequence,
                        2 / len(self.starting_sequence),
                        self.alphabet,
                    ))

            sequences = np.array(list(sequences))
            return sequences, self.model.get_fitness(sequences)

        last_round_sequences = measured_sequences_data[
            measured_sequences_data["round"] == last_round]

        # gamma is our threshold (the self.Q-th percentile of sequences from last round)
        # we will pick all of last round's sequences with fitness above the Qth
        # percentile
        gamma = np.percentile(last_round_sequences["true_score"], 100 * self.Q)
        initial_batch = last_round_sequences["sequence"][
            last_round_sequences["true_score"] >= gamma].to_numpy()
        initial_weights = np.ones(len(initial_batch))

        # Pad the pool with random mutants if it is too small.
        all_samples, all_weights = self._extend_samples(
            initial_batch, initial_weights)

        # this will be the current state of the generator
        self.generator.train_model(all_samples, all_weights)

        # save the weights of the initial vae and save it as vae_0:
        # there are issues with keras model saving and loading,
        # so we have to recompile it
        generator_0 = VAE(
            seq_length=self.generator.seq_length,
            alphabet=self.generator.alphabet,
            batch_size=self.generator.batch_size,
            latent_dim=self.generator.latent_dim,
            intermediate_dim=self.generator.intermediate_dim,
            epochs=self.generator.epochs,
            epsilon_std=self.generator.epsilon_std,
            beta=self.generator.beta,
            validation_split=self.generator.validation_split,
            verbose=self.generator.verbose,
        )
        original_weights = self.generator.vae.get_weights()
        generator_0.vae.set_weights(original_weights)
        vae_0 = generator_0.vae

        sequences = {}
        previous_model_cost = self.model.cost
        while self.model.cost - previous_model_cost < self.model_queries_per_batch:
            # generate new samples using the generator (second argument is a list of all
            # existing measured and proposed seqs)
            proposals = self.generator.generate(
                self.cycle_batch_size,
                all_samples,
                all_weights,
            )
            print(self.model.cost - previous_model_cost, len(proposals))

            # calculate the scores of the new samples using the model
            scores = self.model.get_fitness(proposals)

            # set a new fitness threshold if the new percentile is
            # higher than the current
            gamma = max(np.percentile(scores, self.Q * 100), gamma)

            # cbas and dbas mostly the same except cbas also does an importance
            # sampling step
            if self.algo == "cbas":
                # calculate the weights for the proposed batch
                log_probs_0 = self.generator.calculate_log_probability(
                    proposals, vae=vae_0)
                log_probs_t = self.generator.calculate_log_probability(
                    proposals)

                weights = np.exp(log_probs_0 - log_probs_t)
                # Replace NaN / infinite ratios with finite numbers.
                weights = np.nan_to_num(weights)

            # Otherwise, `self.algo == "dbas"`
            else:
                weights = np.ones(len(proposals))

            # Proposals scoring below the threshold get zero weight.
            weights[scores < gamma] = 0

            # add proposed samples to the total sample pool
            all_samples = np.append(all_samples, proposals)
            all_weights = np.append(all_weights, weights)

            # update the generator
            self.generator.train_model(all_samples, all_weights)

            sequences.update(zip(proposals, scores))

        # We propose the top `self.sequences_batch_size` new sequences we have generated
        new_seqs = np.array(list(sequences.keys()))
        preds = np.array(list(sequences.values()))
        # Reverse the ascending argsort and keep the first k entries; the
        # former `[:-k:-1]` slice returned only k - 1 of them.
        sorted_order = np.argsort(preds)[::-1][:self.sequences_batch_size]

        return new_seqs[sorted_order], preds[sorted_order]