Python mix_with_cat_var Examples, load_format_data.mix_with_cat_var Python Examples

Example #1

0

Show file

File: submodels_module.py Project: zachschmitz/Developability

    def save_sequence_embeddings(self):
        """Pickle the learned sequence embeddings produced by each saved model.

        For each of the two hard-coded dataset names the dataframe is loaded,
        every input sequence is paired with the one-hot vector of the first
        assay, and the output of the model's sequence-embedding layer is
        stored in the 'learned_embedding' column. The dataframe is pickled
        once per dataset and per model index (0, 1, 2) under
        './datasets/predicted/'.
        """
        dataset_names = ['assay_to_dot_training_data', 'seq_to_dot_test_data']
        one_hot = np.eye(len(self.assays))

        for df_name in dataset_names:
            df = load_format_data.load_df(df_name)
            x_a = self.get_input_seq(df)
            for model_idx in range(3):
                # Only one assay's categorical variable is needed to get the
                # sequence embedding, so the first one-hot row is used.
                cat_var = [one_hot[0].tolist() for _ in x_a]
                x = load_format_data.mix_with_cat_var(x_a, cat_var)

                # The nn architecture has to be built before the pickled
                # sklearn model / nn weights can be loaded.
                hyperparam = self.get_best_trial()['hyperparam']
                self._model.set_model(
                    hyperparam,
                    xa_len=len(x[0]) - len(cat_var[0]),
                    cat_var_len=len(cat_var[0]))
                self.load_model(model_idx)

                embedder = self._model.get_seq_embeding_layer_model()
                # Each predicted embedding is wrapped in a one-element list
                # before being assigned to the column.
                df.loc[:, 'learned_embedding'] = [
                    [embedding] for embedding in embedder.predict([x])
                ]

                out_path = ('./datasets/predicted/learned_embedding_' +
                            df_name + '_' + self.model_name + '_' +
                            str(model_idx) + '.pkl')
                df.to_pickle(out_path)

Example #2

0

Show file

 def save_predictions(self, input_df_description=None, yield2show=None):
     '''Return model yield predictions for nested sampling.

     Despite the name, nothing is written to disk: the predictions are
     handed back to the caller.

     input_df_description : must be a dataframe with a column that contains the learned embedding as saved
         by ns_seq_to_assay_model.save_sequence_embeddings(). The None default is not usable, because the
         argument is copied with .copy() before anything else happens.
     yield2show : array of booleans selecting the yields to predict, [iq yield, sh yield].
         Defaults to both yields.

     Returns the prediction of the selected yield (as produced by .tolist()) when exactly one yield is
     selected; otherwise np.sum(..., axis=0) over the predictions of all selected yields, i.e. the sum
     of the iq and sh predictions when both are requested.
     '''
     if yield2show is None:
         yield2show = np.array([True, True])
     df = input_df_description.copy()
     x_a = self.get_input_seq(df, self.model_no)
     # Keep only the one-hot rows and the column names of the requested yields.
     OH_matrix = np.eye(2)[yield2show, :].copy()
     matrix_col = np.array(['IQ_Average_bc', 'SH_Average_bc'])[yield2show].copy()
     p = []
     for i in range(len(matrix_col)):
         # Every sequence gets the same categorical (one-hot) vector for yield i.
         cat_var = [OH_matrix[i].tolist() for _ in x_a]
         x = load_format_data.mix_with_cat_var(x_a, cat_var)
         df_prediction = self._model.model.predict(x).squeeze().tolist()
         # Compare by value: the previous `is 1` tested object identity, which
         # is implementation-dependent and a SyntaxWarning on Python 3.8+.
         if len(matrix_col) == 1:
             return df_prediction
         p.append(df_prediction)
     # return a sum of the two
     return np.sum(p, axis=0)

Example #3

0

Show file

    def save_predictions(self, input_df_description=None):
        """Predict both yield columns for a dataset and pickle the result.

        When ``input_df_description`` is falsy, the dataset named
        'seq_to_assay_train_' + self.assay_str is loaded (only a certain
        number of these files exist, but more can be created). Otherwise the
        named file is loaded from the 'predicted/' location, which is how
        predicted embeddings are used.

        The predictions are written to the 'IQ_Average_bc' and
        'SH_Average_bc' columns, the matching '_std' columns are filled with
        zeros, and the dataframe is pickled under './datasets/predicted/'.
        """
        if input_df_description:
            df = load_format_data.load_df('predicted/' + input_df_description)
        else:
            input_df_description = 'seq_to_assay_train_' + self.assay_str
            df = load_format_data.load_df(input_df_description)

        yield_columns = ['IQ_Average_bc', 'SH_Average_bc']
        x_a = self.get_input_seq(df)
        for model_idx in range(1):  # number of models
            self.load_model(model_idx)
            for one_hot_row, col_name in zip(np.eye(2), yield_columns):
                # Every sequence gets the same one-hot vector for this yield.
                cat_var = [one_hot_row.tolist() for _ in x_a]
                x = load_format_data.mix_with_cat_var(x_a, cat_var)
                predicted = self._model.model.predict(x).squeeze().tolist()
                df.loc[:, col_name] = predicted
                # No spread is estimated here; the std column is all zeros.
                df.loc[:, col_name + '_std'] = [0] * len(predicted)
            out_path = ('./datasets/predicted/' + input_df_description + '_' +
                        self.model_name + '_' + str(model_idx) + '.pkl')
            df.to_pickle(out_path)

Example #4

0

Show file

 def save_predictions(self):
     """Pickle assay score predictions for the 'seq_to_dot_test_data' set.

     For each of the three model indices and for every assay in
     ``self.assays``, the model is rebuilt and loaded, and its predictions
     are stored in the 'Sort<assay>_mean_score' column. One pickle per model
     index is written under './datasets/predicted/' for later use with the
     assay-to-yield model.
     """
     # will have to adjust if missing datapoints
     df = load_format_data.load_df('seq_to_dot_test_data')
     one_hot = np.eye(len(self.assays))
     x_a = self.get_input_seq(df)
     for model_idx in range(3):
         for assay_idx, assay in enumerate(self.assays):
             # Every sequence gets the one-hot vector of the current assay.
             cat_var = [one_hot[assay_idx].tolist() for _ in x_a]
             x = load_format_data.mix_with_cat_var(x_a, cat_var)

             # The nn architecture has to be built before the pickled
             # sklearn model / nn weights can be loaded.
             hyperparam = self.get_best_trial()['hyperparam']
             self._model.set_model(
                 hyperparam,
                 xa_len=len(x[0]) - len(cat_var[0]),
                 cat_var_len=len(cat_var[0]),
                 lin_or_sig=self.lin_or_sig)
             self.load_model(model_idx)

             predicted = self._model.model.predict(x).squeeze().tolist()
             score_col = 'Sort' + str(assay) + '_mean_score'
             df.loc[:, score_col] = predicted
         out_path = ('./datasets/predicted/seq_to_dot_test_data_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
         df.to_pickle(out_path)

Example #5

0

Show file

File: model_module.py Project: brycejoh16/Developability

 def format_modelIO(self, df):
     """Create the model input (x) and output (y) from ``df``.

     Returns a tuple ``(x, y, cat_var)`` where ``x`` is the sequence input
     mixed with the categorical variables.
     """
     # The output is handled first because that step explodes the
     # categorical variables and yields the dataframe used for the inputs.
     exploded_df, cat_var, y = self.get_output_and_explode(df)
     # xa (OH seq, Ord seq, assay, control)
     seq_input = self.get_input_seq(exploded_df)
     mixed_input = load_format_data.mix_with_cat_var(seq_input, cat_var)
     return mixed_input, y, cat_var

Example #6

0

Show file

    def save_predictions(self, df):
        """Add assay score predictions to ``df`` and return it.

        For every assay in ``self.assays`` the predictions are written to the
        'Sort<assay>_mean_score' column of ``df`` (the dataframe is modified
        in place). No model is loaded here; whatever ``self._model.model``
        currently holds is used.
        """
        one_hot = np.eye(len(self.assays))
        x_a = self.get_input_seq(df)
        for assay_idx, assay in enumerate(self.assays):
            # Every sequence gets the one-hot vector of the current assay.
            cat_var = [one_hot[assay_idx].tolist() for _ in x_a]
            x = load_format_data.mix_with_cat_var(x_a, cat_var)
            predicted = self._model.model.predict(x).squeeze().tolist()
            score_col = 'Sort' + str(assay) + '_mean_score'
            df.loc[:, score_col] = predicted

        return df

Example #7

0

Show file

 def save_predictions(self, input_df_description=None):
     """Predict both yield columns for a dataset and pickle the result.

     ``input_df_description`` is the name of the data file to read. When it
     is falsy, the default 'seq_to_assay_train_1,8,10' data is loaded;
     otherwise the named file is loaded from the 'predicted/' location
     (used for predicted embeddings).

     NOTE(review): per the original author's notes this only works on a
     child class that also supplies get_input_seq() - confirm against the
     class hierarchy.
     """
     if input_df_description:
         df = load_format_data.load_df('predicted/' + input_df_description)
     else:
         input_df_description = 'seq_to_assay_train_1,8,10'
         # will have to adjust if missing datapoints
         df = load_format_data.load_df(input_df_description)

     # One row of the 2x2 identity matrix per yield column.
     yield_columns = ['IQ_Average_bc', 'SH_Average_bc']
     x_a = self.get_input_seq(df)
     for model_idx in range(1):  # number of models
         self.load_model(model_idx)
         for one_hot_row, col_name in zip(np.eye(2), yield_columns):
             # Repeat the yield's one-hot vector once per entry of x_a.
             cat_var = [one_hot_row.tolist() for _ in x_a]
             # Combine the sequence input with the categorical variables.
             x = load_format_data.mix_with_cat_var(x_a, cat_var)
             # squeeze() drops single-dimensional entries before the
             # prediction is converted to a list.
             predicted = self._model.model.predict(x).squeeze().tolist()
             df.loc[:, col_name] = predicted
             # The matching '_std' column is filled with zeros.
             df.loc[:, col_name + '_std'] = [0] * len(predicted)
         out_path = ('./datasets/predicted/' + input_df_description + '_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
         df.to_pickle(out_path)

Example #8

0

Show file

File: model_module.py Project: sidlax2503/DevRep

 def format_modelIO(self, df):
     """Create the model input (x) and output (y) from the dataframe ``df``.

     Returns the tuple ``(x, y, cat_var)``.

     NOTE(review): per the original author's notes, get_output_and_explode()
     and get_input_seq() are supplied by the concrete submodel classes and
     delegate to helpers in load_format_data - confirm there for the exact
     contents of the returned values.
     """
     # The output is handled first because that step explodes the
     # categorical variables and yields the dataframe used for the inputs.
     exploded_df, cat_var, y = self.get_output_and_explode(df)
     # xa (OH seq, Ord seq, assay, control)
     seq_input = self.get_input_seq(exploded_df)
     # Mix xa with the categorical variables.
     mixed_input = load_format_data.mix_with_cat_var(seq_input, cat_var)
     return mixed_input, y, cat_var

Example #9

0

Show file

 def save_sequence_embeddings(self, df_list=None):
     """Pickle the learned sequence embeddings produced by each saved model.

     ``df_list`` is an optional list of dataset names. When it is falsy the
     two default datasets 'assay_to_dot_training_data' and
     'seq_to_dot_test_data' are used.

     For every dataset and every model index (0, 1, 2) the output of the
     model's sequence-embedding layer is stored in the 'learned_embedding'
     column and the dataframe is pickled under './datasets/predicted/'.
     """
     dataset_names = df_list if df_list else [
         'assay_to_dot_training_data', 'seq_to_dot_test_data'
     ]
     # Identity matrix with one row per assay.
     one_hot = np.eye(len(self.assays))
     for df_name in dataset_names:
         df = load_format_data.load_df(df_name)
         x_a = self.get_input_seq(df)
         for model_idx in range(3):
             # Only one assay's categorical variable is needed to get the
             # sequence embedding, so the first one-hot row is used.
             cat_var = [one_hot[0].tolist() for _ in x_a]
             x = load_format_data.mix_with_cat_var(x_a, cat_var)

             # The nn architecture has to be built before the pickled
             # sklearn model / nn weights can be loaded.
             hyperparam = self.get_best_trial()['hyperparam']
             self._model.set_model(
                 hyperparam,
                 xa_len=len(x[0]) - len(cat_var[0]),
                 cat_var_len=len(cat_var[0]),
                 lin_or_sig=self.lin_or_sig)
             self.load_model(model_idx)

             embedder = self._model.get_seq_embeding_layer_model()
             # Each predicted embedding is wrapped in a one-element list
             # before being assigned to the column.
             df.loc[:, 'learned_embedding'] = [
                 [embedding] for embedding in embedder.predict([x])
             ]

             out_path = ('./datasets/predicted/learned_embedding_' +
                         df_name + '_' + self.model_name + '_' +
                         str(model_idx) + '.pkl')
             df.to_pickle(out_path)

Example #10

0

Show file

    def save_sequence_embeddings(self, df_list=None):
        """Return a copy of ``df_list`` with the learned sequence embedding added.

        Despite its name, ``df_list`` must be a dataframe-like object: it is
        copied with ``.copy()`` and passed to ``get_input_seq``, so the None
        default is not usable. The model is not loaded here; it is expected
        to be preloaded already.

        The output of the model's sequence-embedding layer is stored in the
        'learned_embedding_0' column of the copy, which is returned. The
        input object itself is not assigned to.
        """
        # each model is already preloaded
        df = df_list.copy()
        OH_matrix = np.eye(len(self.assays))
        x_a = self.get_input_seq(df)
        # Only one assay's categorical variable is needed to get the sequence
        # embedding, so every sequence gets the first one-hot row.
        cat_var = [OH_matrix[0].tolist() for _ in x_a]
        x = load_format_data.mix_with_cat_var(x_a, cat_var)
        seq_embedding_model = self._model.get_seq_embeding_layer_model()
        df_prediction = seq_embedding_model.predict([x])
        # Each predicted embedding is wrapped in a one-element list before
        # being assigned to the column.
        seq_emb_list = [[embedding] for embedding in df_prediction]
        df.loc[:, 'learned_embedding_' + str(0)] = seq_emb_list  # todo : change str to z

        return df

Example #11

0

Show file

File: submodels_module.py Project: zachschmitz/Developability

 def save_predictions(self):
     """Predict both yield columns for the large dataset and pickle them.

     The 'seq_to_assay_train_1,8,9,10' dataset is loaded once. For each of
     the three model indices the model is loaded, its predictions are
     written to the 'IQ_Average_bc' and 'SH_Average_bc' columns, and the
     dataframe is pickled under './datasets/predicted/'.
     """
     # will have to adjust if missing datapoints
     df = load_format_data.load_df('seq_to_assay_train_1,8,9,10')
     yield_columns = ['IQ_Average_bc', 'SH_Average_bc']
     x_a = self.get_input_seq(df)
     for model_idx in range(3):  # number of models
         self.load_model(model_idx)
         for one_hot_row, col_name in zip(np.eye(2), yield_columns):
             # Every sequence gets the same one-hot vector for this yield.
             cat_var = [one_hot_row.tolist() for _ in x_a]
             x = load_format_data.mix_with_cat_var(x_a, cat_var)
             predicted = self._model.model.predict(x).squeeze().tolist()
             df.loc[:, col_name] = predicted
         out_path = ('./datasets/predicted/seq_to_assay_train_1,8,9,10_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
         df.to_pickle(out_path)

Example #12

0

Show file

 def save_predictions(self):
     """Pickle assay score predictions for the 'seq_to_dot_test_data' set.

     Takes no input. For each of the three model indices and for every
     assay in ``self.assays``, the model is rebuilt with the best trial's
     hyperparameters and loaded, and its predictions are stored in the
     'Sort<assay>_mean_score' column. One pickle per model index is written
     under './datasets/predicted/' for later use with the assay-to-yield
     model.
     """
     # will have to adjust if missing datapoints
     df = load_format_data.load_df('seq_to_dot_test_data')
     # Identity matrix with one row per assay.
     one_hot = np.eye(len(self.assays))
     x_a = self.get_input_seq(df)
     for model_idx in range(3):
         for assay_idx, assay in enumerate(self.assays):
             # Repeat the assay's one-hot row once per entry of x_a.
             cat_var = [one_hot[assay_idx].tolist() for _ in x_a]
             x = load_format_data.mix_with_cat_var(x_a, cat_var)

             # The nn architecture has to be built before the pickled
             # sklearn model / nn weights can be loaded.
             hyperparam = self.get_best_trial()['hyperparam']
             self._model.set_model(
                 hyperparam,
                 xa_len=len(x[0]) - len(cat_var[0]),
                 cat_var_len=len(cat_var[0]),
                 lin_or_sig=self.lin_or_sig)
             self.load_model(model_idx)

             predicted = self._model.model.predict(x).squeeze().tolist()
             score_col = 'Sort' + str(assay) + '_mean_score'
             df.loc[:, score_col] = predicted
         out_path = ('./datasets/predicted/seq_to_dot_test_data_' +
                     self.model_name + '_' + str(model_idx) + '.pkl')
         df.to_pickle(out_path)