def build_new_model():
    """Copy the trained sequence-embedding weights into a standalone Keras model.

    Loads the 'emb_cnn' sequence-to-assay model for assays [1, 8, 10], pulls
    out its sequence-embedding sub-model, rebuilds the same layer stack
    (Embedding -> Conv1D -> GlobalMaxPool1D) as a fresh ``Sequential`` model
    using the best-trial hyperparameters, transfers the weights and saves the
    result under ``'best_emb_model'``.
    """
    # import tensorflow as tf
    trained = mb.seq_to_assay_model([1, 8, 10], 'emb_cnn', 1)
    trained._model.set_model(
        trained.get_best_trial()['hyperparam'],
        xa_len=16,
        cat_var_len=3,
        lin_or_sig=trained.lin_or_sig,
    )
    trained.load_model(0)
    source_embedder = trained._model.get_seq_embeding_layer_model()

    # Hyperparameters of the best trial define the shape of the rebuilt layers.
    best = trained.get_best_trial()['hyperparam']
    n_filters = int(best['filters'])
    kernel_width = int(best['kernel_size'])
    input_drop = best['input_drop']  # looked up but not used by the layers below
    embedding_dim = int(best['AA_emb_dim'])

    layer_stack = (
        tf.keras.layers.Embedding(21, embedding_dim, input_length=16),
        tf.keras.layers.Conv1D(
            filters=n_filters, kernel_size=kernel_width, activation='relu'),
        tf.keras.layers.GlobalMaxPool1D(name='seq_embedding'),
    )
    rebuilt = tf.keras.Sequential()
    for layer in layer_stack:
        rebuilt.add(layer)

    rebuilt.build((None, 16))
    rebuilt.compile()
    # Weight transfer relies on both models having matching layer shapes.
    rebuilt.set_weights(source_embedder.get_weights())
    rebuilt.save('best_emb_model')
def main():
    """Train, test and export one sequence-to-assay model at several sample fractions.

    Must be launched from the terminal with one integer argument
    (``sys.argv[1]``) in the range [0, 4]; it selects the regression
    architecture from ``c_models``. For each sample fraction in
    (0.01, 0.1, 0.5) a ``modelbank.seq_to_assay_model`` is built for assays
    [1, 8, 10], cross-validated, tested, and its predictions are saved.
    Sequence embeddings are saved only for architectures whose name
    contains ``'emb'``.

    Raises:
        IndexError: if the index argument is missing or outside [0, 4].
        ValueError: if the argument is not an integer.
    """
    # Full list of available architectures, kept for reference:
    # c_models=['ridge','fnn','emb_fnn_flat','emb_fnn_maxpool','emb_fnn_maxpool_linear','emb_rnn','small_emb_rnn','small_emb_atn_rnn','small_emb_rnn_linear',
    # 'emb_cnn','small_emb_cnn','small_emb_atn_cnn','small_emb_cnn_linear']
    c_models = [
        'ridge', 'fnn', 'emb_fnn_flat', 'small_emb_rnn_linear', 'emb_cnn'
    ]

    if len(sys.argv) < 2:
        raise IndexError(
            'missing model index argument; expected an integer in [0, {}]'
            .format(len(c_models) - 1))
    toggle_no = int(sys.argv[1])
    # Reject negative values explicitly: Python would otherwise wrap them
    # around and silently pick a model from the end of the list.
    if not 0 <= toggle_no < len(c_models):
        raise IndexError(
            'model index {} is out of range; expected an integer in [0, {}]'
            .format(toggle_no, len(c_models) - 1))
    arch = c_models[toggle_no]

    for ss in [0.01, 0.1, .5]:
        # Arguments: assays used to build the model, architecture name,
        # and the sample fraction.
        c = modelbank.seq_to_assay_model([1, 8, 10], arch, ss)
        # Per the original notes: cross-validation selects the
        # hyperparameters, then test_model() trains with them.
        c.cross_validate_model()
        c.test_model()
        # Saved for later use with the assay_to_yield_model predictions.
        c.save_predictions()
        if 'emb' in arch:
            c.save_sequence_embeddings()
def main():
    """Train, test and export one full-data sequence-to-assay model.

    Expects one integer command-line argument (``sys.argv[1]``) that indexes
    into ``c_models`` (13 architectures, so valid values are 0-12). The chosen
    architecture is built for assays [1, 8, 10] with a sample fraction of 1,
    cross-validated, tested, and its predictions are saved. Sequence
    embeddings are saved only for architectures whose name contains ``'emb'``.

    Raises:
        IndexError: if the index argument is missing or out of range.
        ValueError: if the argument is not an integer.
    """
    c_models = [
        'ridge', 'fnn', 'emb_fnn_flat', 'emb_fnn_maxpool',
        'emb_fnn_maxpool_linear', 'emb_rnn', 'small_emb_rnn',
        'small_emb_atn_rnn', 'small_emb_rnn_linear', 'emb_cnn',
        'small_emb_cnn', 'small_emb_atn_cnn', 'small_emb_cnn_linear'
    ]

    if len(sys.argv) < 2:
        raise IndexError(
            'missing model index argument; expected an integer in [0, {}]'
            .format(len(c_models) - 1))
    toggle_no = int(sys.argv[1])
    # Negative indices would silently wrap around to the end of the list.
    if not 0 <= toggle_no < len(c_models):
        raise IndexError(
            'model index {} is out of range; expected an integer in [0, {}]'
            .format(toggle_no, len(c_models) - 1))
    arch = c_models[toggle_no]

    c = modelbank.seq_to_assay_model([1, 8, 10], arch, 1)
    c.cross_validate_model()
    c.test_model()
    c.save_predictions()
    # NOTE(review): 'ridge' and 'fnn' are assumed to have no learned sequence
    # embedding to export, so the export is limited to 'emb' architectures.
    if 'emb' in arch:
        c.save_sequence_embeddings()
def __init__(self,
             s2a_params=None,
             e2y_params=None,
             Nb_sequences=1000,
             Nb_positions=16):
    """Set up a nested-sampling run over ordinal-encoded sequences.

    Args:
        s2a_params: positional arguments for ``mb.seq_to_assay_model``;
            defaults to ``[[1, 8, 10], 'emb_cnn', 1]``.
        e2y_params: trailing positional arguments for
            ``mb.sequence_embeding_to_yield_model``; defaults to
            ``['svm', 1]``.
        Nb_sequences: number of sequences to sample.
        Nb_positions: number of ordinal positions per sequence.
    """
    # TODO: check times for different number of sequences
    # Fall back to the default model parameters when none are supplied.
    e2y_params = ['svm', 1] if e2y_params is None else e2y_params
    s2a_params = [[1, 8, 10], 'emb_cnn', 1] if s2a_params is None else s2a_params

    # Parent random number generator, seeded from 4 bytes of OS entropy.
    # note: things may change between tensorflow versions
    seed = int.from_bytes(os.urandom(4), sys.byteorder)
    self.g_parent = tf.random.experimental.Generator.from_seed(seed)

    # Initial sample: ordinal sequences with a zeroed developability column.
    sampled = pd.DataFrame()
    self.original_seq = sampled
    sampled['Ordinal'] = sm.make_sampling_data(generator=self.g_parent,
                                               Nb_sequences=Nb_sequences,
                                               Nb_positions=Nb_positions)
    sampled['Developability'] = np.zeros(Nb_sequences)
    self.nb_of_sequences = Nb_sequences
    self.test_seq = sampled.copy()
    # self.nb_of_sequences,_=np.shape(self.original_seq['Ordinal'])

    self.s2a = mb.seq_to_assay_model(*s2a_params)
    # The yield model requires one extra trailing entry after the s2a
    # parameters; 0 is passed as a placeholder.
    self.e2y = mb.sequence_embeding_to_yield_model(s2a_params + [0],
                                                   *e2y_params)

    # Run bookkeeping, all empty until sampling starts.
    self.start_time = None
    self.times = pd.DataFrame()
    self.min_yield = []
    self.vp = []
    self.percent_pos = []
    self.vp_step = []
    self.dir_name = []

    # TODO: make a run stats file save it to the directory
    # NOTE(review): {'e2y'} is a set literal, not a dict -- confirm this is
    # the intended DataFrame content.
    self.run_stats = pd.DataFrame({'e2y'})
def main():
    """Train, test and export one full-data sequence-to-assay model.

    Must be run from the terminal with one integer argument
    (``sys.argv[1]``) in the range [0, 12]; it selects the regression
    architecture from ``c_models``. A ``modelbank.seq_to_assay_model`` is
    built for assays [1, 8, 10] with a sample fraction of 1, cross-validated,
    tested, and its predictions are saved. Sequence embeddings are saved only
    for architectures whose name contains ``'emb'``.

    Raises:
        IndexError: if the index argument is missing or outside [0, 12].
        ValueError: if the argument is not an integer.
    """
    # Each entry names one regression architecture.
    c_models = [
        'ridge', 'fnn', 'emb_fnn_flat', 'emb_fnn_maxpool',
        'emb_fnn_maxpool_linear', 'emb_rnn', 'small_emb_rnn',
        'small_emb_atn_rnn', 'small_emb_rnn_linear', 'emb_cnn',
        'small_emb_cnn', 'small_emb_atn_cnn', 'small_emb_cnn_linear'
    ]

    if len(sys.argv) < 2:
        raise IndexError(
            'missing model index argument; expected an integer in [0, {}]'
            .format(len(c_models) - 1))
    toggle_no = int(sys.argv[1])
    # Negative indices would silently wrap around to the end of the list.
    if not 0 <= toggle_no < len(c_models):
        raise IndexError(
            'model index {} is out of range; expected an integer in [0, {}]'
            .format(toggle_no, len(c_models) - 1))
    arch = c_models[toggle_no]

    # Arguments: assays used to build the model, architecture name, and the
    # sample fraction (1 = all data).
    c = modelbank.seq_to_assay_model([1, 8, 10], arch, 1)
    # Per the original notes: cross-validation picks the best
    # hyperparameters, then test_model() trains with them.
    c.cross_validate_model()
    c.test_model()
    c.save_predictions()
    # NOTE(review): 'ridge' and 'fnn' are assumed to have no learned sequence
    # embedding to export, so the export is limited to 'emb' architectures.
    if 'emb' in arch:
        c.save_sequence_embeddings()
'Linear Model', 'One-Hot', 'Flatten AA Prop', 'Small Recurrent', 'Small Recurrent + Atn', 'Linear Top, Small Recurrent', 'Convolutional', 'Small Convolutional', 'Small Convolutional + Atn' ] c_names.reverse() ## c_names is a string list where each string is the names of the bars that is to be constructed. The inital order of the c_names list is reversed. c_mdl_test_loss, c_mdl_test_std = [], [] ## Two empty lists c_mdl_test_loss and c_mdl_test_std are created to track the regression loss and the standard deviation of the loss ## for different models. for arch in c_models: ## An iterbale arch is created to work through each element in the c_models list. c_prop = [[1, 8, 10], arch, 1] ## An integer list with different assays to be used to build a model, the iterable arch and a sample fraction of 1 are stored ## in a list c_prop mdl = modelbank.seq_to_assay_model(*c_prop) ## An object mdl, whoch is of type seq_to_assay_model is created. This object is defined in the submodel_module and it is instantiated with ## the elements of the c_prop list. c_mdl_test_loss.append(mdl.model_stats['test_avg_loss']) c_mdl_test_std.append(mdl.model_stats['test_std_loss']) ## The average and standard deviation of the test regression loss, saved in the test_avg_loss and test_std_loss columns in the mdl class dataframe ## model_stats is accessed and added to the c_mdl_test_loss and c_mdl_test_std lists respectively. control_model = modelbank.control_to_assay_model([1, 8, 10], 'ridge', 1) control_loss = control_model.model_stats['test_avg_loss'] exploded_df, _, _ = load_format_data.explode_assays([1, 8, 10], control_model.testing_df) exp_var = np.average(np.square(np.array(exploded_df['y_std']))) ## A new control_to_yield_model() object os created from the submodel_module.py program, the object is instantiated with a ## ridge model regression and a sample fraction of 1. The average test loss of this model is then accessed and stored in control_loss variable. 
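## Then the explode_assays() function of the load_format_data.py program is run for assays 1, 8 and 10 using the testing_df of the control_model object. The first returned value is stored in exploded_df, and exp_var is the average of the squared 'y_std' column of exploded_df.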
# Example script: save learned sequence embeddings, then load the
# embedding-to-yield models used to predict yield from them.
start_time = time.time()

import submodels_module as mb
import load_format_data
import pandas as pd
import numpy as np

# Set the pandas df of sequences to be predicted; it must contain an "Ordinal"
# column of the paratope. The file should be saved under /datasets/.
df = ['seq_to_assay_train_1,8,10']  # this is just an example

# Import the sequence_to_assay model (red box).
# Currently uses the 'emb_fnn_maxpool_linear' architecture to predict assays
# 1, 8 and 10; this will probably change once the most accurate model is found.
s2a_params = [[1, 8, 10], 'emb_fnn_maxpool_linear', 0.01]
s2a = mb.seq_to_assay_model(*s2a_params)

# Now save the sequence embeddings; the file is under
# /datasets/predicted/learned_embedding_[model properties], col='learned_embedding'.
# Saves 3 different embeddings from 3 different models.
s2a.save_sequence_embeddings(df)

# Import the embedding_to_yield model (green box).
# Average the prediction over the 3 different models, summing the yield of
# both cell types. Currently using a ridge model, but this will probably change.
e2y_params = ['ridge', 1]
predicted_yield_per_model = []
for i in range(3):
    # Load the yield model paired with the i-th saved embedding model.
    e2y = mb.sequence_embeding_to_yield_model(s2a_params + [i], *e2y_params)
    # save predictions from learned embeddings in s2a model
# Utilizing HT assays for training devrep, and using the devrep embedding to predict yield # assays for training devrep are currently limited to 1,8,10 as those were the most predictive in the first paper # Below is an example of training and testing and predicting the best performing architecture of DevRep ### import submodels_module as modelbank #define model parameters #assays are numbered in order as found in SI table # #model architectures for predicting yield are: ['ridge','fnn','emb_fnn_flat','emb_fnn_maxpool','emb_fnn_maxpool_linear','emb_rnn','small_emb_rnn','small_emb_atn_rnn','small_emb_rnn_linear', # 'emb_cnn','small_emb_cnn','small_emb_atn_cnn','small_emb_cnn_linear'] devrep_mdl_param={'assays':[1,8,10], 'model_architecture':'emb_cnn', 'sample_fraction':1} #initialize model based upon model parameters mdl=modelbank.seq_to_assay_model(**devrep_mdl_param) #cross-validate model mdl.cross_validate_model() #test the model on the limited test set mdl.test_model() #return the results from cv and testing print(mdl.model_stats) #plot the predicted results #figure is saved in ./figures/ mdl.plot() #save the learned embeddings