def save_sequence_embeddings(self):
    """Pickle learned sequence embeddings for two fixed datasets.

    For each of 'assay_to_dot_training_data' and 'seq_to_dot_test_data' the
    frame is loaded, its sequence inputs are paired with the one-hot row of
    the first assay, and for each model index 0, 1, 2 the output of the
    embedding-layer model is stored in a 'learned_embedding' column. One
    pickle per (dataset, model index) is written under ./datasets/predicted/.
    """
    dataset_names = ['assay_to_dot_training_data', 'seq_to_dot_test_data']
    assay_one_hot = np.eye(len(self.assays))
    for df_name in dataset_names:
        frame = load_format_data.load_df(df_name)
        seq_inputs = self.get_input_seq(frame)
        for model_idx in range(3):
            # Only the first assay's one-hot row is used: a single
            # categorical setting is enough to obtain the sequence embedding.
            cat_var = [assay_one_hot[0].tolist() for _ in seq_inputs]
            x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
            # The architecture is rebuilt before loading model `model_idx`;
            # the sequence-input width is the mixed row width minus the
            # categorical width.
            self._model.set_model(
                self.get_best_trial()['hyperparam'],
                xa_len=len(x[0]) - len(cat_var[0]),
                cat_var_len=len(cat_var[0]))
            self.load_model(model_idx)
            embedding_model = self._model.get_seq_embeding_layer_model()
            embeddings = embedding_model.predict([x])
            # Each embedding is wrapped in a one-element list so that every
            # DataFrame cell holds a single object.
            frame.loc[:, 'learned_embedding'] = [[row] for row in embeddings]
            frame.to_pickle('./datasets/predicted/learned_embedding_' +
                            df_name + '_' + self.model_name + '_' +
                            str(model_idx) + '.pkl')
def save_predictions(self, input_df_description=None, yield2show=None):
    """Return model yield predictions for nested sampling.

    Nothing is written to disk; the predictions are returned to the caller.

    Args:
        input_df_description: DataFrame with a column containing the learned
            embedding as saved by
            ns_seq_to_assay_model.save_sequence_embeddings(). Despite the
            ``None`` default a frame is required: ``.copy()`` is called on
            it unconditionally, so ``None`` raises ``AttributeError``.
        yield2show: boolean selector ``[IQ yield, SH yield]`` choosing which
            yields to predict. Defaults to both.

    Returns:
        The prediction for the single selected yield (whatever
        ``predict(...).squeeze().tolist()`` produces), or, when several
        yields are selected, their element-wise sum as computed by
        ``np.sum(..., axis=0)``.
    """
    if yield2show is None:
        yield2show = np.array([True, True])
    df = input_df_description.copy()
    x_a = self.get_input_seq(df, self.model_no)

    # Rows of the 2x2 identity are the one-hot codes for
    # ['IQ_Average_bc', 'SH_Average_bc']; keep only the selected ones.
    OH_matrix = np.eye(2)[yield2show, :].copy()

    predictions = []
    for one_hot_row in OH_matrix:
        # Every sequence gets the same categorical (yield type) code.
        cat_var = [one_hot_row.tolist() for _ in x_a]
        x = load_format_data.mix_with_cat_var(x_a, cat_var)
        predictions.append(self._model.model.predict(x).squeeze().tolist())

    # Compare by value: the former `len(...) is 1` relied on CPython's
    # small-integer caching and triggers a SyntaxWarning on Python 3.8+.
    if len(predictions) == 1:
        return predictions[0]
    # return a sum of the selected yields
    return np.sum(predictions, axis=0)
def save_predictions(self, input_df_description=None):
    """Predict IQ and SH yields for a dataset and pickle the result.

    Args:
        input_df_description: name of a dataset stored under 'predicted/'
            (e.g. one carrying predicted embeddings). When falsy, the
            dataset 'seq_to_assay_train_' + self.assay_str is loaded
            instead; only a certain number of those files exist, but more
            can be created.

    The frame gains 'IQ_Average_bc' / 'SH_Average_bc' prediction columns and
    matching '*_std' columns filled with zeros, then is written to
    ./datasets/predicted/<description>_<model_name>_<model index>.pkl.
    """
    if input_df_description:
        # for using predicted embeddings
        df = load_format_data.load_df('predicted/' + input_df_description)
    else:
        input_df_description = 'seq_to_assay_train_' + self.assay_str
        df = load_format_data.load_df(input_df_description)

    yield_one_hot = np.eye(2)
    yield_columns = ['IQ_Average_bc', 'SH_Average_bc']
    seq_inputs = self.get_input_seq(df)

    for model_idx in range(1):  # no of models
        self.load_model(model_idx)
        for row_idx, column in enumerate(yield_columns):
            # Same one-hot yield code for every sequence.
            cat_var = [yield_one_hot[row_idx].tolist() for _ in seq_inputs]
            x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
            predicted = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, column] = predicted
            # No uncertainty estimate is produced here; std is set to 0.
            df.loc[:, column + '_std'] = [0] * len(predicted)
        df.to_pickle('./datasets/predicted/' + input_df_description + '_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
def save_predictions(self):
    """Pickle assay-score predictions for the 'seq_to_dot_test_data' set.

    The output is intended for use with the assay-to-yield model. For each
    model index 0, 1, 2 and each assay in ``self.assays`` a column
    'Sort<assay>_mean_score' is filled with the model's prediction, and one
    pickle per model index is written to
    ./datasets/predicted/seq_to_dot_test_data_<model_name>_<index>.pkl.
    """
    # will have to adjust if missing datapoints
    df = load_format_data.load_df('seq_to_dot_test_data')
    assay_one_hot = np.eye(len(self.assays))
    seq_inputs = self.get_input_seq(df)

    for model_idx in range(3):  # for each model
        for assay_idx, assay in enumerate(self.assays):  # for each assay
            # Tag every sequence with the current assay's one-hot code.
            cat_var = [assay_one_hot[assay_idx].tolist() for _ in seq_inputs]
            x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
            # The architecture is rebuilt before loading model `model_idx`;
            # the sequence-input width is the mixed row width minus the
            # categorical width.
            cat_width = len(cat_var[0])
            self._model.set_model(
                self.get_best_trial()['hyperparam'],
                xa_len=len(x[0]) - cat_width,
                cat_var_len=cat_width,
                lin_or_sig=self.lin_or_sig)
            self.load_model(model_idx)
            predicted = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, 'Sort' + str(assay) + '_mean_score'] = predicted
        df.to_pickle('./datasets/predicted/seq_to_dot_test_data_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
def format_modelIO(self, df):
    """Create the model input (x) and output (y) for ``df``.

    The result depends on the model architecture and the categorical
    variables, via ``get_output_and_explode`` and ``get_input_seq``.

    Returns:
        Tuple ``(x, y, cat_var)``: the sequence inputs mixed with the
        categorical variables, the targets, and the categorical variables.
    """
    # Outputs are resolved first: that step also hands back the frame used
    # for the inputs together with the categorical variables.
    exploded_df, cat_var, y = self.get_output_and_explode(df)
    # Sequence-side input (OH seq, Ord seq, assay, control per the original
    # note), then combined with the categorical variables.
    seq_inputs = self.get_input_seq(exploded_df)
    return load_format_data.mix_with_cat_var(seq_inputs, cat_var), y, cat_var
def save_predictions(self, df):
    """Add predicted assay-score columns to ``df`` and return it.

    The output is intended for use with the assay-to-yield model. For every
    assay in ``self.assays`` a column 'Sort<assay>_mean_score' is written
    with the currently held model's prediction. ``df`` is modified in place
    and the same object is returned; nothing is written to disk.
    """
    assay_one_hot = np.eye(len(self.assays))
    seq_inputs = self.get_input_seq(df)
    for assay_idx, assay in enumerate(self.assays):  # for each assay
        # Tag every sequence with the current assay's one-hot code.
        cat_var = [assay_one_hot[assay_idx].tolist() for _ in seq_inputs]
        x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
        predicted = self._model.model.predict(x).squeeze().tolist()
        df.loc[:, 'Sort' + str(assay) + '_mean_score'] = predicted
    return df
def save_predictions(self, input_df_description=None):
    """Predict IQ and SH yields for a dataset and pickle the result.

    Relies on ``get_input_seq``, ``load_model``, ``_model`` and
    ``model_name`` being available on the instance.

    Args:
        input_df_description: name of a dataset file stored under
            'predicted/' (e.g. one carrying predicted embeddings). When
            falsy, the default dataset 'seq_to_assay_train_1,8,10' is
            loaded instead.

    The frame gains 'IQ_Average_bc' / 'SH_Average_bc' prediction columns and
    matching '*_std' columns filled with zeros, then is written to
    ./datasets/predicted/<description>_<model_name>_<model index>.pkl.
    """
    if input_df_description:
        # for using predicted embeddings
        df = load_format_data.load_df('predicted/' + input_df_description)
    else:
        input_df_description = 'seq_to_assay_train_1,8,10'
        # will have to adjust if missing datapoints
        df = load_format_data.load_df(input_df_description)

    # Row k of the 2x2 identity is the one-hot code for yield_columns[k].
    yield_one_hot = np.eye(2)
    yield_columns = ['IQ_Average_bc', 'SH_Average_bc']
    seq_inputs = self.get_input_seq(df)

    for model_idx in range(1):  # no of models
        self.load_model(model_idx)
        for row_idx, column in enumerate(yield_columns):
            # Repeat the yield's one-hot code once per sequence.
            cat_var = [yield_one_hot[row_idx].tolist() for _ in seq_inputs]
            x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
            # squeeze() drops singleton dimensions before list conversion.
            predicted = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, column] = predicted
            # No uncertainty estimate is produced here; std is set to 0.
            df.loc[:, column + '_std'] = [0] * len(predicted)
        df.to_pickle('./datasets/predicted/' + input_df_description + '_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
def format_modelIO(self, df):
    """Create the model input (x) and output (y) for the DataFrame ``df``.

    The work is delegated to ``self.get_output_and_explode`` and
    ``self.get_input_seq``, whose behaviour depends on the concrete object;
    see load_format_data.py for the underlying helpers.

    Returns:
        Tuple ``(x, y, cat_var)``: the sequence inputs mixed with the
        categorical variables, the targets, and the categorical variables.
    """
    # Outputs are resolved first: that step also hands back the frame used
    # for the inputs together with the categorical variables.
    exploded_df, cat_var, y = self.get_output_and_explode(df)
    # Sequence-side input (OH seq, Ord seq, assay, control per the original
    # note), then combined with the categorical variables.
    seq_inputs = self.get_input_seq(exploded_df)
    x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
    return x, y, cat_var
def save_sequence_embeddings(self, df_list=None):
    """Pickle learned sequence embeddings for each named dataset.

    Args:
        df_list: list of dataset names to process. When falsy, defaults to
            ['assay_to_dot_training_data', 'seq_to_dot_test_data'].

    For every dataset and every model index 0, 1, 2 the output of the
    embedding-layer model is stored in a 'learned_embedding' column and the
    frame is written to
    ./datasets/predicted/learned_embedding_<dataset>_<model_name>_<index>.pkl.
    """
    if not df_list:
        df_list = ['assay_to_dot_training_data', 'seq_to_dot_test_data']
    # One one-hot row per assay the model was built with.
    assay_one_hot = np.eye(len(self.assays))
    for df_name in df_list:
        frame = load_format_data.load_df(df_name)
        seq_inputs = self.get_input_seq(frame)
        for model_idx in range(3):  # for each model
            # Only the first assay's one-hot row is used: a single
            # categorical setting is enough to obtain the sequence embedding.
            cat_var = [assay_one_hot[0].tolist() for _ in seq_inputs]
            x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
            # The architecture is rebuilt before loading model `model_idx`;
            # the sequence-input width is the mixed row width minus the
            # categorical width.
            cat_width = len(cat_var[0])
            self._model.set_model(
                self.get_best_trial()['hyperparam'],
                xa_len=len(x[0]) - cat_width,
                cat_var_len=cat_width,
                lin_or_sig=self.lin_or_sig)
            self.load_model(model_idx)
            embedding_model = self._model.get_seq_embeding_layer_model()
            embeddings = embedding_model.predict([x])
            # Each embedding is wrapped in a one-element list so that every
            # DataFrame cell holds a single object.
            frame.loc[:, 'learned_embedding'] = [[row] for row in embeddings]
            frame.to_pickle('./datasets/predicted/learned_embedding_' +
                            df_name + '_' + self.model_name + '_' +
                            str(model_idx) + '.pkl')
def save_sequence_embeddings(self, df_list=None):
    """Return a copy of ``df_list`` with a learned-embedding column added.

    Nothing is written to disk, and the currently held model is used as-is
    (each model is already preloaded by the caller).

    Args:
        df_list: despite the name, a single frame-like object supporting
            ``.copy()`` and ``.loc`` (presumably a pandas DataFrame -
            confirm against callers). Despite the ``None`` default it is
            required; ``None`` raises ``AttributeError`` on ``.copy()``.

    Returns:
        The copy of ``df_list`` with column 'learned_embedding_0' holding,
        per row, a one-element list wrapping that row's embedding.
    """
    df = df_list.copy()
    OH_matrix = np.eye(len(self.assays))
    x_a = self.get_input_seq(df)

    # Only the first assay's one-hot row is used: a single categorical
    # setting is enough to obtain the sequence embedding.
    cat_var = [OH_matrix[0].tolist() for _ in x_a]
    x = load_format_data.mix_with_cat_var(x_a, cat_var)

    seq_embedding_model = self._model.get_seq_embeding_layer_model()
    df_prediction = seq_embedding_model.predict([x])

    # todo : change str to z
    df.loc[:, 'learned_embedding_' + str(0)] = [[emb] for emb in df_prediction]
    return df
def save_predictions(self):
    """Predict IQ and SH yields for the large dataset and pickle them.

    Loads 'seq_to_assay_train_1,8,9,10' and, for each model index 0, 1, 2,
    overwrites the 'IQ_Average_bc' and 'SH_Average_bc' columns with that
    model's predictions before writing
    ./datasets/predicted/seq_to_assay_train_1,8,9,10_<model_name>_<index>.pkl.
    """
    # will have to adjust if missing datapoints
    df = load_format_data.load_df('seq_to_assay_train_1,8,9,10')
    # Row k of the 2x2 identity is the one-hot code for yield_columns[k].
    yield_one_hot = np.eye(2)
    yield_columns = ['IQ_Average_bc', 'SH_Average_bc']
    seq_inputs = self.get_input_seq(df)

    for model_idx in range(3):  # no of models
        self.load_model(model_idx)
        for row_idx, column in enumerate(yield_columns):
            # Repeat the yield's one-hot code once per sequence.
            cat_var = [yield_one_hot[row_idx].tolist() for _ in seq_inputs]
            x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
            df.loc[:, column] = self._model.model.predict(x).squeeze().tolist()
        df.to_pickle('./datasets/predicted/seq_to_assay_train_1,8,9,10_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
def save_predictions(self):
    """Pickle assay-score predictions for the 'seq_to_dot_test_data' set.

    Takes no arguments. The output is intended for use with the
    assay-to-yield model. For each model index 0, 1, 2 and each assay in
    ``self.assays`` a column 'Sort<assay>_mean_score' is filled with the
    model's prediction, and one pickle per model index is written to
    ./datasets/predicted/seq_to_dot_test_data_<model_name>_<index>.pkl.
    """
    # will have to adjust if missing datapoints
    df = load_format_data.load_df('seq_to_dot_test_data')
    # One one-hot row per assay the model was built with.
    assay_one_hot = np.eye(len(self.assays))
    seq_inputs = self.get_input_seq(df)

    for model_idx in range(3):  # for each model
        for assay_idx, assay in enumerate(self.assays):  # for each assay
            # Repeat the current assay's one-hot code once per sequence.
            cat_var = [assay_one_hot[assay_idx].tolist() for _ in seq_inputs]
            x = load_format_data.mix_with_cat_var(seq_inputs, cat_var)
            # The architecture is rebuilt from the best trial's
            # hyperparameters before loading model `model_idx`; the
            # sequence-input width is the mixed row width minus the
            # categorical width.
            cat_width = len(cat_var[0])
            self._model.set_model(
                self.get_best_trial()['hyperparam'],
                xa_len=len(x[0]) - cat_width,
                cat_var_len=cat_width,
                lin_or_sig=self.lin_or_sig)
            self.load_model(model_idx)
            predicted = self._model.model.predict(x).squeeze().tolist()
            df.loc[:, 'Sort' + str(assay) + '_mean_score'] = predicted
        df.to_pickle('./datasets/predicted/seq_to_dot_test_data_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')