def directed_evolution(s_wt, num_iterations, T, Model, params):
    """Run one in-silico directed-evolution trajectory from a wild-type sequence.

    An initial mutant is derived from ``s_wt``; afterwards, each iteration
    proposes a further-mutated trial sequence, scores it with ``Model`` and
    accepts or rejects it with a Metropolis-style criterion.

    Args:
        s_wt: Wild-type protein sequence.
        num_iterations: Number of trial-mutation iterations to perform.
        T: "Temperature" dividing the fitness difference in the acceptance
            probability ``min(1, exp((y_new - y) / T))``.
        Model: Object whose ``predict`` method maps the first output of
            ``get_reps`` to a predicted fitness score.
        params: Forwarded unchanged to ``get_reps`` as ``params=``.

    Returns:
        Tuple ``(s_traj, y_traj)``: the current sequence and its fitness
        recorded once per iteration (the initial mutant itself is only
        recorded if it is still current after the first iteration).
    """
    sequence_history = []
    fitness_history = []

    # NOTE(review): random.randint includes BOTH endpoints, so the seed may
    # equal len(s_wt). Whether that is a valid position depends on
    # mutate_sequence -- confirm.
    seed_position = random.randint(0, len(s_wt))

    # The initial mutant carries Poisson(2) + 1 mutations.
    initial_mutation_count = np.random.poisson(2) + 1
    current_seq, mutation_site = mutate_sequence(
        s_wt, initial_mutation_count, seed_position
    )
    current_rep, _, _ = get_reps([current_seq], params=params)
    current_fitness = Model.predict(current_rep)

    for _ in range(num_iterations):
        # Draw the Poisson rate first, then the number of mutations (>= 1).
        mu = np.random.uniform(1, 2.5)
        mutation_count = np.random.poisson(mu - 1) + 1

        trial_seq, mutation_site = mutate_sequence(
            current_seq, mutation_count, mutation_site
        )
        trial_rep, _, _ = get_reps([trial_seq], params=params)
        trial_fitness = Model.predict(trial_rep)

        # Improvements are always accepted; worse trials are accepted with
        # probability exp(delta / T).
        acceptance_probability = min(
            1, np.exp((trial_fitness - current_fitness) / T)
        )
        if random.random() < acceptance_probability:
            # Report the accepted change as "<1-based position> <old>-><new>".
            # Only the site returned by mutate_sequence is shown.
            print(
                f"{mutation_site + 1!s} "
                f"{current_seq[mutation_site]}->{trial_seq[mutation_site]}"
            )
            current_seq = trial_seq
            current_fitness = trial_fitness

        # Record the (possibly unchanged) state for this iteration.
        sequence_history.append(current_seq)
        fitness_history.append(current_fitness)

    return sequence_history, fitness_history
def test_get_reps():
    """Check get_reps on a bare string versus a one-element list, and output shapes.

    A single sequence passed as a plain string must yield the same three
    arrays as the same sequence wrapped in a list, and a batch of three
    sequences must yield three arrays of shape (3, 1900).
    """
    list_out_0, list_out_1, list_out_2 = get_reps(["ABC"])
    str_out_0, str_out_1, str_out_2 = get_reps("ABC")

    paired_outputs = zip(
        (list_out_0, list_out_1, list_out_2),
        (str_out_0, str_out_1, str_out_2),
    )
    for from_list, from_string in paired_outputs:
        assert np.array_equal(from_list, from_string)

    batch = ["ABC", "DEFGH", "DEF"]
    rep_0, rep_1, rep_2 = get_reps(batch)
    for rep in (rep_0, rep_1, rep_2):
        assert rep.shape == (3, 1900)
def test_sample_one_chain():
    """Check the keys and per-key record length returned by sample_one_chain."""

    def summed_first_rep(sequence):
        # Score a sequence by summing the first array returned by get_reps.
        return get_reps(sequence)[0].sum()

    n_steps = 10
    chain_data = sample_one_chain("AAC", n_steps, summed_first_rep)

    expected_keys = {"sequences", "scores", "accept"}
    assert set(chain_data.keys()) == expected_keys

    # One extra entry because the starting step is recorded as well.
    expected_length = n_steps + 1
    for _key, record in chain_data.items():
        assert len(record) == expected_length
def scoring_func_reverse(sequence: str):
    """Score ``sequence`` as the reciprocal of the top model's prediction.

    The first output of ``ju.get_reps`` is passed to ``top_model.predict``
    and the result is inverted, so larger predictions give smaller scores.
    """
    # A fresh deep copy of the module-level params is handed to get_reps on
    # every call.
    params_copy = deepcopy(params)
    representation, _, _ = ju.get_reps(sequence, params=params_copy)
    prediction = top_model.predict(representation)
    return 1 / prediction
# Optional input: Training parameters
parser.add_argument(
    '-p', '--parameters',
    type=str,
    default=None,
    help='Parameter directory for mLSTM.',
)

# Parse arguments
args = parser.parse_args()
sequence_file = args.infile
parameter_dir = args.parameters
outfile = args.outfile

# Load input sequences, one per line with surrounding whitespace stripped.
# The context manager closes the input file once it has been read.
with open(sequence_file) as sequence_handle:
    sequences = [line.strip() for line in sequence_handle]

# Load UniRep parameters from the given directory. Without one, None is
# passed through to get_reps (presumably selecting its default weights --
# confirm).
if parameter_dir:
    params = ju.utils.load_params(folderpath=parameter_dir)[0]
else:
    params = None

# Make representations for each input sequence; only the first output of
# get_reps is kept.
h_avg, _, _ = ju.get_reps(sequences, params=params)

# Save representations as tab-separated text.
with open(outfile, 'w') as o:
    np.savetxt(o, h_avg, delimiter="\t")
"""
Sequence sampler test.

In this Python script, we start with a sequence, and try to optimize for a
better version of it, as measured by the sum over reps. It's a silly task,
but I think it gives us the ability to sanity-check that we have the right
thing going.
"""
from jax_unirep import get_reps
from jax_unirep.sampler import is_accepted, propose

starting_sequence = "ASDFGHJKL"
current_sequence = starting_sequence
# The score of a sequence is the sum over the first array from get_reps.
current_score = get_reps(current_sequence)[0].sum()

# Chain history; entry 0 is the starting state.
sequences = [current_sequence]
scores = [current_score]

for i in range(100):
    new_sequence = propose(current_sequence)
    new_score = get_reps(new_sequence)[0].sum()

    if is_accepted(best=current_score, candidate=new_score, temperature=1):
        # Move the chain. The score must advance together with the sequence,
        # otherwise every later proposal is judged against a stale score.
        current_sequence = new_sequence
        current_score = new_score

    # Record the chain state after every step so both lists stay aligned.
    sequences.append(current_sequence)
    scores.append(current_score)
    print(i, new_sequence, new_score)