def __init__(self, model_in, assays, model_architecture, sample_fraction):
    """Configure a model whose output is the scores of the selected assays.

    Args:
        model_in: forwarded unchanged as the first argument of the parent
            initializer.
        assays: iterable of assay identifiers (the original notes describe
            it as a list of non-repeating numbers between 1 and 10).
        model_architecture: forwarded unchanged to the parent initializer.
        sample_fraction: forwarded unchanged to the parent initializer.
    """
    # Output label handed to the parent, e.g. [1, 8, 10] -> 'assay1,8,10'.
    assay_label = ','.join(map(str, assays))
    super().__init__(model_in, 'assay' + assay_label, model_architecture,
                     sample_fraction)

    self.assays = assays
    # explode_assays with the assay list pre-bound as its first argument.
    self.get_output_and_explode = partial(load_format_data.explode_assays,
                                          assays)
    self.plot_type = plot_model.x_to_assay_plot

    # could adjust in future for sequences with predictive assays
    self.training_df = load_format_data.load_df('seq_to_assay_train_1,8,10')
    self.testing_df = load_format_data.load_df('assay_to_dot_training_data')

    self.lin_or_sig = 'sigmoid'
    # Cross-validation uses 3 splits, repeated 3 times.
    self.num_cv_splits = self.num_cv_repeats = 3
    self.num_test_repeats = 10
    self.num_hyp_trials = 50
def save_predictions(self, input_df_description=None):
    """Save model predictions for the large dataset.

    Args:
        input_df_description: name of a dataset inside the ``predicted``
            directory (for using predicted embeddings). When falsy, the
            dataset ``'seq_to_assay_train_' + self.assay_str`` is loaded
            instead.

    For every loaded model the two prediction columns and their ``_std``
    companions (filled with zeros) are written to the dataframe, which is
    then pickled under ``./datasets/predicted/``.
    """
    if input_df_description:
        # for using predicted embeddings
        df = load_format_data.load_df('predicted/' + input_df_description)
    else:
        # only a certain number of these files exist, but more can be created
        input_df_description = 'seq_to_assay_train_' + self.assay_str
        df = load_format_data.load_df(input_df_description)

    output_columns = ['IQ_Average_bc', 'SH_Average_bc']
    one_hot_rows = np.eye(2)  # one one-hot row per output column
    x_a = self.get_input_seq(df)

    for model_no in range(1):  # no of models
        self.load_model(model_no)
        for one_hot, col_name in zip(one_hot_rows, output_columns):
            # Same one-hot row repeated once per element of x_a.
            cat_var = [one_hot.tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)
            predictions = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, col_name] = predictions
            df.loc[:, col_name + '_std'] = [0] * len(predictions)
        out_path = ''.join(('./datasets/predicted/', input_df_description,
                            '_', self.model_name, '_', str(model_no), '.pkl'))
        df.to_pickle(out_path)
def __init__(self, assays, stringency, model_architecture, sample_fraction):
    """Configure a two-gate model fed by the selected assays.

    Args:
        assays: iterable of assay identifiers; also passed on to
            ``assay_to_x_model.__init__``.
        stringency: string appended to the model label and to the names of
            the training/testing datasets.
        model_architecture: forwarded unchanged to the parent initializer.
        sample_fraction: forwarded unchanged to the parent initializer.
    """
    # Comma-separated assay ids, e.g. [1, 8, 10] -> '1,8,10'.
    self.assay_str = ','.join(map(str, assays))
    input_label = ''.join(('twogate', stringency, '_assays', self.assay_str))
    super().__init__(input_label, model_architecture, sample_fraction)
    assay_to_x_model.__init__(self, assays)

    load_df = load_format_data.load_df
    # Replace the datasets with the two-gate variants for this stringency.
    self.training_df = load_df('assay_to_dot_training_data_twogate_' +
                               stringency)
    self.testing_df = load_df('seq_to_dot_test_data_twogate_' + stringency)
def __init__(self, model_in, model_architecture, sample_fraction):
    """Configure a model whose output is yield.

    Args:
        model_in: forwarded unchanged as the first argument of the parent
            initializer.
        model_architecture: forwarded unchanged to the parent initializer.
        sample_fraction: forwarded unchanged to the parent initializer.
    """
    super().__init__(model_in, 'yield', model_architecture, sample_fraction)

    self.get_output_and_explode = load_format_data.explode_yield
    self.plot_type = plot_model.x_to_yield_plot

    load_df = load_format_data.load_df
    self.training_df = load_df('assay_to_dot_training_data')
    self.testing_df = load_df('seq_to_dot_test_data')

    # Cross-validation uses 10 splits, repeated 10 times.
    self.num_cv_splits = self.num_cv_repeats = 10
    self.num_test_repeats = 3
    self.num_hyp_trials = 50
def save_predictions(self, input_df_description=None):
    """Save model predictions for the large dataset.

    Relies on ``get_input_seq``, ``load_model``, ``_model`` and
    ``model_name`` being available on the instance; the original notes say
    it therefore only works for child classes that also inherit an
    input-specific class.

    Args:
        input_df_description: name of a dataset inside the ``predicted``
            directory (for using predicted embeddings). When falsy, the
            default ``seq_to_assay_train_1,8,10`` dataset is loaded.
    """
    if input_df_description:
        # for using predicted embeddings
        df = load_format_data.load_df('predicted/' + input_df_description)
    else:
        # will have to adjust if missing datapoints
        input_df_description = 'seq_to_assay_train_1,8,10'
        df = load_format_data.load_df(input_df_description)

    # Column written for each row of the 2x2 identity matrix below.
    output_columns = ['IQ_Average_bc', 'SH_Average_bc']
    one_hot_rows = np.eye(2)
    x_a = self.get_input_seq(df)

    for model_no in range(1):  # no of models
        self.load_model(model_no)
        for one_hot, col_name in zip(one_hot_rows, output_columns):
            # Same one-hot row repeated once per element of x_a.
            cat_var = [one_hot.tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)
            # squeeze() drops single-dimensional axes before list conversion.
            predictions = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, col_name] = predictions
            # The matching *_std column is filled with zeros.
            df.loc[:, col_name + '_std'] = [0] * len(predictions)
        out_path = ''.join(('./datasets/predicted/', input_df_description,
                            '_', self.model_name, '_', str(model_no), '.pkl'))
        df.to_pickle(out_path)
def save_sequence_embeddings(self):
    """Save the sequence embeddings produced by each of the three models.

    For both the training and the test dataset, the embedding-layer model
    is evaluated and its output stored in a ``learned_embedding`` column;
    one pickle per dataset and model is written under
    ``./datasets/predicted/``.
    """
    dataset_names = ['assay_to_dot_training_data', 'seq_to_dot_test_data']
    OH_matrix = np.eye(len(self.assays))
    for dataset_name in dataset_names:
        df = load_format_data.load_df(dataset_name)
        x_a = self.get_input_seq(df)
        for model_no in range(3):  # for each model
            # only need to get cat var for one assay to get sequence
            # embedding, so the first one-hot row is used for every sequence
            cat_var = [OH_matrix[0].tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)

            # need to build nn arch
            hyperparams = self.get_best_trial()['hyperparam']
            cat_var_len = len(cat_var[0])
            self._model.set_model(hyperparams,
                                  xa_len=len(x[0]) - cat_var_len,
                                  cat_var_len=cat_var_len)
            # load pkled sklearn model or weights of nn model
            self.load_model(model_no)

            embedding_model = self._model.get_seq_embeding_layer_model()
            embeddings = embedding_model.predict([x])
            # Each embedding is wrapped in its own single-item list.
            df.loc[:, 'learned_embedding'] = [[emb] for emb in embeddings]
            df.to_pickle('./datasets/predicted/learned_embedding_' +
                         dataset_name + '_' + self.model_name + '_' +
                         str(model_no) + '.pkl')
def save_predictions(self):
    """Save assay score predictions of the test dataset.

    The saved files are meant to be used with the assay-to-yield model.
    For each of the three models a ``Sort<assay>_mean_score`` column is
    written per assay, and the dataframe is pickled under
    ``./datasets/predicted/``.
    """
    # will have to adjust if missing datapoints
    df = load_format_data.load_df('seq_to_dot_test_data')
    OH_matrix = np.eye(len(self.assays))
    x_a = self.get_input_seq(df)

    for model_no in range(3):  # for each model
        for assay, one_hot in zip(self.assays, OH_matrix):  # for each assay
            # for each sequence add cat_var
            cat_var = [one_hot.tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)

            # need to build nn arch
            hyperparams = self.get_best_trial()['hyperparam']
            cat_var_len = len(cat_var[0])
            self._model.set_model(hyperparams,
                                  xa_len=len(x[0]) - cat_var_len,
                                  cat_var_len=cat_var_len,
                                  lin_or_sig=self.lin_or_sig)
            # load pkled sklearn model or weights of nn model
            self.load_model(model_no)

            predictions = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, 'Sort' + str(assay) + '_mean_score'] = predictions
        df.to_pickle('./datasets/predicted/seq_to_dot_test_data_' +
                     self.model_name + '_' + str(model_no) + '.pkl')
def __init__(self, model_in, assays, model_architecture, sample_fraction):
    """Configure a model whose output is the scores of the selected assays.

    Args:
        model_in: forwarded unchanged as the first argument of the parent
            initializer.
        assays: iterable of assay identifiers; joined with commas to build
            the output label passed to the parent.
        model_architecture: forwarded unchanged to the parent initializer.
        sample_fraction: forwarded unchanged to the parent initializer.
    """
    # Output label handed to the parent, e.g. [1, 8, 10] -> 'assay1,8,10'.
    assay_label = ','.join(map(str, assays))
    super().__init__(model_in, 'assay' + assay_label, model_architecture,
                     sample_fraction)

    self.assays = assays
    # explode_assays with the assay list pre-bound as its first argument.
    self.get_output_and_explode = partial(load_format_data.explode_assays,
                                          assays)
    self.plot_type = plot_model.x_to_assay_plot

    # could adjust in future for sequences with predictive assays
    self.training_df = load_format_data.load_df(
        'seq_to_assay_train_1,8,9,10')
    self.testing_df = load_format_data.load_df('assay_to_dot_training_data')

    self.num_cv_splits = self.num_cv_repeats = self.num_test_repeats = 3
    self.num_hyp_trials = 50
def __init__(self, seq_to_assay_model_prop, model_architecture,
             sample_fraction):
    """Configure a model whose input is a saved learned sequence embedding.

    Args:
        seq_to_assay_model_prop: indexable with at least four items. Item 0
            is an iterable of assay identifiers; items 1-3 are stringified
            and appended to the name of the sequence-to-assay model.
        model_architecture: forwarded unchanged to the parent initializer.
        sample_fraction: forwarded unchanged to the parent initializer.
    """
    self.assay_str = ','.join(map(str, seq_to_assay_model_prop[0]))
    prop_suffix = '_'.join(
        str(seq_to_assay_model_prop[k]) for k in (1, 2, 3))
    seq_to_assay_model_name = 'seq_assay' + self.assay_str + '_' + prop_suffix

    super().__init__('embedding_' + seq_to_assay_model_name,
                     model_architecture, sample_fraction)
    sequence_embedding_to_x_model.__init__(self)
    self.num_test_repeats = 1

    # NOTE(review): unlike the other dataset names in this file, these start
    # with '/' -- confirm load_df resolves them as intended.
    load_df = load_format_data.load_df
    self.training_df = load_df(
        '/predicted/learned_embedding_assay_to_dot_training_data_' +
        seq_to_assay_model_name)
    self.testing_df = load_df(
        '/predicted/learned_embedding_seq_to_dot_test_data_' +
        seq_to_assay_model_name)
def apply_predicted_assay_scores(self, seq_to_assay_model_prop):
    """Test the saved assay-to-yield model on saved predicted assay scores.

    Args:
        seq_to_assay_model_prop: indexable with at least three items; each
            is stringified and appended to the sequence-to-assay model name.

    Side effects: overwrites ``num_test_repeats``, ``testing_df``,
    ``figure_file`` and ``stats_file`` on the instance, then calls
    ``test_model()``.
    """
    prop_suffix = '_'.join(
        str(seq_to_assay_model_prop[k]) for k in range(3))
    seq_to_assay_model_name = 'seq_assay' + self.assay_str + '_' + prop_suffix

    self.num_test_repeats = 1
    self.testing_df = load_format_data.load_df(
        'predicted/seq_to_dot_test_data_' + seq_to_assay_model_name)

    # Figure and stats outputs share the same base name.
    self.figure_file = ''.join(('./figures/', self.model_name, '_',
                                seq_to_assay_model_name, '.png'))
    self.stats_file = ''.join(('./model_stats/', self.model_name, '_',
                               seq_to_assay_model_name, '.pkl'))
    self.test_model()
def __init__(self, model_in, model_architecture, sample_fraction):
    """Configure a model whose output is yield.

    Args:
        model_in: forwarded unchanged as the first argument of the parent
            initializer.
        model_architecture: forwarded unchanged to the parent initializer.
        sample_fraction: forwarded unchanged to the parent initializer.
    """
    super().__init__(model_in, 'yield', model_architecture, sample_fraction)

    # Output extraction and plotting are delegated to the yield-specific
    # helpers of load_format_data and plot_model.
    self.get_output_and_explode = load_format_data.explode_yield
    self.plot_type = plot_model.x_to_yield_plot

    load_df = load_format_data.load_df
    self.training_df = load_df('assay_to_dot_training_data')
    self.testing_df = load_df('seq_to_dot_test_data')

    self.lin_or_sig = 'linear'
    # Cross-validation uses 10 splits, repeated 10 times.
    self.num_cv_splits = self.num_cv_repeats = 10
    self.num_test_repeats = 10
    self.num_hyp_trials = 50
def __init__(self, pred_yield_model_prop, seq_to_pred_yield_prop):
    """Configure a sequence model trained on previously predicted yields.

    Args:
        pred_yield_model_prop: indexable with at least four items. Item 0 is
            an iterable of assay identifiers, item 1 is a string, and items
            2-3 are stringified; together they name the model whose saved
            predictions are used as training data.
        seq_to_pred_yield_prop: indexable with at least two items, passed to
            the parent initializers.
    """
    first_prop = seq_to_pred_yield_prop[0]
    super().__init__('seq', first_prop, seq_to_pred_yield_prop[1])
    seq_to_x_model.__init__(self, first_prop)

    self.assay_str = ','.join(map(str, pred_yield_model_prop[0]))
    pred_yield_model_name = ''.join((
        'assays', self.assay_str, '_yield_', pred_yield_model_prop[1],
        '_', str(pred_yield_model_prop[2]),
        '_', str(pred_yield_model_prop[3]),
    ))
    # Append the source model's name to this model's name.
    self.update_model_name(self.model_name + ':' + pred_yield_model_name)
    self.training_df = load_format_data.load_df(
        'predicted/seq_to_assay_train_1,8,9,10_' + pred_yield_model_name)

    self.num_cv_splits = self.num_cv_repeats = 3
    self.num_test_repeats = 1
    self.num_hyp_trials = 50
def save_sequence_embeddings(self, df_list=None):
    """Save the sequence embeddings produced by each of the three models.

    Args:
        df_list: optional list of dataset names to embed. When falsy, the
            training and test datasets
            (``assay_to_dot_training_data``, ``seq_to_dot_test_data``)
            are used.

    For every dataset and model, the embedding-layer output is stored in a
    ``learned_embedding`` column and the dataframe is pickled under
    ``./datasets/predicted/``.
    """
    dataset_names = df_list or [
        'assay_to_dot_training_data', 'seq_to_dot_test_data'
    ]
    # Identity matrix with one row per assay.
    OH_matrix = np.eye(len(self.assays))
    for dataset_name in dataset_names:
        df = load_format_data.load_df(dataset_name)
        x_a = self.get_input_seq(df)
        for model_no in range(3):  # for each model
            # only need to get cat var for one assay to get sequence
            # embedding, so the first one-hot row is used for every sequence
            cat_var = [OH_matrix[0].tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)

            # need to build nn arch
            hyperparams = self.get_best_trial()['hyperparam']
            cat_var_len = len(cat_var[0])
            self._model.set_model(hyperparams,
                                  xa_len=len(x[0]) - cat_var_len,
                                  cat_var_len=cat_var_len,
                                  lin_or_sig=self.lin_or_sig)
            # load pkled sklearn model or weights of nn model
            self.load_model(model_no)

            embedding_model = self._model.get_seq_embeding_layer_model()
            embeddings = embedding_model.predict([x])
            # Each embedding is wrapped in its own single-item list.
            df.loc[:, 'learned_embedding'] = [[emb] for emb in embeddings]
            df.to_pickle('./datasets/predicted/learned_embedding_' +
                         dataset_name + '_' + self.model_name + '_' +
                         str(model_no) + '.pkl')
def apply_predicted_assay_scores(self, seq_to_assay_model_prop):
    """Test the saved assay-to-yield model on saved predicted assay scores.

    Args:
        seq_to_assay_model_prop: list or other indexable with at least three
            items; each is stringified and appended, after
            ``self.assay_str``, to the sequence-to-assay model name.

    Side effects: sets ``num_test_repeats`` to 1, replaces ``testing_df``
    with a dataset from the ``predicted`` directory, repoints
    ``figure_file`` and ``stats_file``, then calls ``test_model()``.
    """
    prop_suffix = '_'.join(
        str(seq_to_assay_model_prop[k]) for k in range(3))
    seq_to_assay_model_name = 'seq_assay' + self.assay_str + '_' + prop_suffix

    self.num_test_repeats = 1
    self.testing_df = load_format_data.load_df(
        'predicted/seq_to_dot_test_data_' + seq_to_assay_model_name)

    # Outputs go to the figures and model_stats directories, named after
    # both this model and the sequence-to-assay model.
    self.figure_file = ''.join(('./figures/', self.model_name, '_',
                                seq_to_assay_model_name, '.png'))
    self.stats_file = ''.join(('./model_stats/', self.model_name, '_',
                               seq_to_assay_model_name, '.pkl'))
    self.test_model()
def save_predictions(self):
    """Save model predictions for the large dataset.

    Loads ``seq_to_assay_train_1,8,9,10`` and, for each of the three models,
    writes both prediction columns and pickles the dataframe under
    ``./datasets/predicted/``.
    """
    # will have to adjust if missing datapoints
    df = load_format_data.load_df('seq_to_assay_train_1,8,9,10')
    output_columns = ['IQ_Average_bc', 'SH_Average_bc']
    one_hot_rows = np.eye(2)  # one one-hot row per output column
    x_a = self.get_input_seq(df)

    for model_no in range(3):  # no of models
        self.load_model(model_no)
        for one_hot, col_name in zip(one_hot_rows, output_columns):
            # Same one-hot row repeated once per element of x_a.
            cat_var = [one_hot.tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)
            df.loc[:, col_name] = (
                self._model.model.predict(x).squeeze().tolist())
        df.to_pickle('./datasets/predicted/seq_to_assay_train_1,8,9,10_' +
                     self.model_name + '_' + str(model_no) + '.pkl')
def save_predictions(self):
    """Save assay score predictions of the test dataset.

    The saved files are meant to be used with the assay-to-yield model.
    Takes no arguments. For each of the three models, a
    ``Sort<assay>_mean_score`` column is written per assay and the
    dataframe is pickled under ``./datasets/predicted/`` with the model
    name and model index in the file name.
    """
    # will have to adjust if missing datapoints
    df = load_format_data.load_df('seq_to_dot_test_data')
    # Identity matrix with one row per assay.
    OH_matrix = np.eye(len(self.assays))
    x_a = self.get_input_seq(df)

    for model_no in range(3):  # for each model
        for assay, one_hot in zip(self.assays, OH_matrix):  # for each assay
            # for each sequence add cat_var
            cat_var = [one_hot.tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)

            # need to build nn arch
            hyperparams = self.get_best_trial()['hyperparam']
            cat_var_len = len(cat_var[0])
            self._model.set_model(hyperparams,
                                  xa_len=len(x[0]) - cat_var_len,
                                  cat_var_len=cat_var_len,
                                  lin_or_sig=self.lin_or_sig)
            # load pkled sklearn model or weights of nn model
            self.load_model(model_no)

            predictions = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, 'Sort' + str(assay) + '_mean_score'] = predictions
        df.to_pickle('./datasets/predicted/seq_to_dot_test_data_' +
                     self.model_name + '_' + str(model_no) + '.pkl')
# Collects one summed-yield prediction array per loop iteration.
# NOTE(review): mb, s2a, s2a_params, e2y_params, df and start_time are
# defined before this fragment -- confirm against the rest of the script.
predicted_yield_per_model = []
for i in range(3):
    # load model
    e2y = mb.sequence_embeding_to_yield_model(s2a_params + [i], *e2y_params)
    # save predictions from learned embeddings in s2a model;
    # df[0] is used as a dataset name here
    input_df_description = 'learned_embedding_' + df[
        0] + '_' + s2a.model_name + '_' + str(i)
    # saved under input_df_description+embedding model properties,
    # col='IQ_Average_bc','SH_Average_bc'
    e2y.save_predictions(input_df_description)
    # load predictions and add the yields of the two cell types together;
    # the trailing str(0) presumably matches the single model index written
    # by save_predictions -- TODO confirm
    output_df_description = 'predicted/' + input_df_description + '_' + e2y.model_name + '_' + str(
        0)
    predicted_df = load_format_data.load_df(output_df_description)
    predicted_iq_yield = predicted_df['IQ_Average_bc'].to_numpy()
    predicted_sh_yield = predicted_df['SH_Average_bc'].to_numpy()
    # Element-wise sum of the two yield columns.
    predicted_added_yield = np.sum([predicted_iq_yield, predicted_sh_yield],
                                   axis=0)
    predicted_yield_per_model.append(predicted_added_yield)
# average over trials (element-wise across the three iterations)
predicted_yield_avg = np.average(predicted_yield_per_model, axis=0)
# load original df, save final df with a Developability column
# (which we want to maximize)
df_original = load_format_data.load_df(df[0])
df_original['Developability'] = predicted_yield_avg.tolist()
df_original.to_pickle('./datasets/' + df[0] + '_with_predictions.pkl')
# Report elapsed time since start_time.
print("--- %s seconds ---" % (time.time() - start_time))