Example 1
 def __init__(self, model_in, assays, model_architecture, sample_fraction):
     ## Instantiating the object requires two strings, an array and a float; the array is a list of non-repeating numbers between 1 and 10.
     ## These are then used, along with the assay string, to create the class variables belonging to its parent class.
     assay_str = ','.join([str(x) for x in assays])
     ## The elements of the assay list are joined into a comma-separated string.
     super().__init__(model_in, 'assay' + assay_str, model_architecture,
                      sample_fraction)
     ## This is then passed into the parent model class's instantiation, which also creates its respective class variables and functions.
     self.assays = assays
     self.get_output_and_explode = partial(load_format_data.explode_assays,
                                           assays)
     ## A class variable called assays is assigned to the input list, and get_output_and_explode is bound to the explode_assays()
     ## function from load_format_data.py with the assays list already supplied as its first argument (see the partial() sketch after this example).
     self.plot_type = plot_model.x_to_assay_plot
     self.training_df = load_format_data.load_df(
         'seq_to_assay_train_1,8,10'
     )  #could adjust in future for sequences with predictive assays
     self.testing_df = load_format_data.load_df(
         'assay_to_dot_training_data')
     ## Similarly, a plot_type class attribute is created and linked to the x_to_assay_plot class in the plot_model.py script.
     ## Two class dataframes, training_df and testing_df, are also created by loading the seq_to_assay_train_1,8,10
     ## and assay_to_dot_training_data files respectively.
     self.lin_or_sig = 'sigmoid'
     self.num_cv_splits = 3
     self.num_cv_repeats = 3
     ## A sigmoid (rather than linear) output is selected via the lin_or_sig class variable, and the number of splits and repeats for cross-validation is
     ## specified in num_cv_splits and num_cv_repeats respectively.
     self.num_test_repeats = 10
     self.num_hyp_trials = 50
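
The partial() binding above can be illustrated with a small stand-alone sketch; explode_assays here is a stand-in with a made-up signature, not the real function from load_format_data.py:

    from functools import partial

    def explode_assays(assays, df):
        # stand-in: pair every row of the input with every requested assay
        return [(row, assay) for row in df for assay in assays]

    get_output_and_explode = partial(explode_assays, [1, 8, 10])
    # later calls only need the dataframe; the assay list is already bound
    print(get_output_and_explode(['seq_A', 'seq_B']))
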
Example 2
    def save_predictions(self, input_df_description=None):
        'saves model predictions for the large dataset'

        if not input_df_description:
            input_df_description = 'seq_to_assay_train_' + self.assay_str  # only a certain number of these files exist, but more can be created
            df = load_format_data.load_df(input_df_description)
        else:
            df = load_format_data.load_df(
                'predicted/' +
                input_df_description)  # for using predicted embeddings

        OH_matrix = np.eye(2)
        matrix_col = ['IQ_Average_bc', 'SH_Average_bc']
        x_a = self.get_input_seq(df)
        for z in range(1):  # no of models
            self.load_model(z)
            for i in range(2):
                cat_var = []
                for j in x_a:
                    cat_var.append(OH_matrix[i].tolist())
                x = load_format_data.mix_with_cat_var(x_a, cat_var)
                df_prediction = self._model.model.predict(x).squeeze().tolist()
                col_name = matrix_col[i]
                df.loc[:, col_name] = df_prediction
                col_name_std = matrix_col[i] + '_std'
                df.loc[:, col_name_std] = [0] * len(df_prediction)
            df.to_pickle('./datasets/predicted/' + input_df_description + '_' +
                         self.model_name + '_' + str(z) + '.pkl')
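
The loop over OH_matrix above follows a one-hot mixing pattern; here is a minimal sketch of it, assuming mix_with_cat_var simply appends the chosen one-hot row to each sequence's features (the real helper lives in load_format_data.py):

    import numpy as np

    x_a = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]  # toy per-sequence feature vectors
    OH_matrix = np.eye(2)                     # one-hot rows for the two cell types

    def mix_with_cat_var(x_a, cat_var):
        # stand-in for load_format_data.mix_with_cat_var
        return [list(seq) + list(cat) for seq, cat in zip(x_a, cat_var)]

    for i in range(2):  # one pass per cell type
        cat_var = [OH_matrix[i].tolist() for _ in x_a]
        x = mix_with_cat_var(x_a, cat_var)
        print(x)  # each row: sequence features followed by the one-hot cell-type flag
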
Example 3
 def __init__(self, assays, stringency, model_architecture,
              sample_fraction):
     self.assay_str = ','.join([str(x) for x in assays])
     super().__init__('twogate' + stringency + '_assays' + self.assay_str,
                      model_architecture, sample_fraction)
     assay_to_x_model.__init__(self, assays)
     self.training_df = load_format_data.load_df(
         'assay_to_dot_training_data_twogate_' + stringency)
     self.testing_df = load_format_data.load_df(
         'seq_to_dot_test_data_twogate_' + stringency)
 def __init__(self, model_in, model_architecture, sample_fraction):
     super().__init__(model_in, 'yield', model_architecture,
                      sample_fraction)
     self.get_output_and_explode = load_format_data.explode_yield
     self.plot_type = plot_model.x_to_yield_plot
     self.training_df = load_format_data.load_df(
         'assay_to_dot_training_data')
     self.testing_df = load_format_data.load_df('seq_to_dot_test_data')
     self.num_cv_splits = 10
     self.num_cv_repeats = 10
     self.num_test_repeats = 3
     self.num_hyp_trials = 50
Example 5
 def save_predictions(self, input_df_description=None):
     'saves model predictions for the large dataset'
     ## This function only works for a child class that inherits from both this class and one of the LIST_B classes.
     ## input_df_description is the name of the file containing the data we are going to access.
     ## If no value is given for input_df_description, the default seq_to_assay_train_1,8,10 data is loaded;
     ## otherwise the named file in the predicted directory is loaded.
     if not input_df_description:
         input_df_description = 'seq_to_assay_train_1,8,10'
         df = load_format_data.load_df(
             input_df_description
         )  #will have to adjust if missing datapoints
     else:
         df = load_format_data.load_df(
             'predicted/' +
             input_df_description)  #for using predicted embeddings
     OH_matrix = np.eye(2)
     ## A 2x2 identity matrix is created and assigned to the OH_matrix variable.
     matrix_col = ['IQ_Average_bc', 'SH_Average_bc']
     ## Another list holds the column headings corresponding to the rows of OH_matrix: IQ_Average_bc and SH_Average_bc respectively.
     x_a = self.get_input_seq(df)
     ## Depending on which LIST_B class the child class inherits from, get_input_seq is linked to a particular function and
     ## returns a particular dataframe.
     for z in range(1):  #no of models
         self.load_model(z)
         ## The load_model() function defined in the model class is run, which updates the model class variable of the
         ## model_architecture.py class objects.
         for i in range(2):
             cat_var = []
             for j in x_a:
                 cat_var.append(OH_matrix[i].tolist())
             ## A list cat_var is created by repeating the chosen one-hot row once for each entry in the
             ## x_a input.
             x = load_format_data.mix_with_cat_var(x_a, cat_var)
             ## Then the cat_var list, along with the x_a input, is passed into the mix_with_cat_var() function, which in turn
             ## returns a list concatenating x_a and cat_var.
             df_prediction = self._model.model.predict(x).squeeze().tolist()
             ## This accesses the model class variable for that particular model architecture; then, using the list created above,
             ## predictions are generated, squeezed to remove single-dimensional entries, and converted into a list.
             col_name = matrix_col[i]
             ## col_name selects the column (IQ_Average_bc or SH_Average_bc) that will hold these predictions.
             df.loc[:, col_name] = df_prediction
             ## Then the initial dataframe df, loaded using the input string, has its IQ_Average_bc or
             ## SH_Average_bc column set to the predictions stored in the df_prediction list.
             col_name_std = matrix_col[i] + '_std'
             df.loc[:, col_name_std] = [0] * len(df_prediction)
             ## Similarly, the IQ_Average_bc_std and SH_Average_bc_std columns are set to zeros (see the pandas sketch after this function).
         df.to_pickle('./datasets/predicted/' + input_df_description + '_' +
                      self.model_name + '_' + str(z) + '.pkl')
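
A minimal pandas sketch of the column-assignment and pickling pattern used in save_predictions above, with toy data and a hypothetical output file name:

    import pandas as pd

    df = pd.DataFrame({'Sequence': ['AAA', 'AAB', 'ABB']})
    df_prediction = [0.7, 0.2, 0.9]  # toy model outputs

    df.loc[:, 'IQ_Average_bc'] = df_prediction                 # predicted mean per sequence
    df.loc[:, 'IQ_Average_bc_std'] = [0] * len(df_prediction)  # placeholder std of zeros
    df.to_pickle('./toy_predictions.pkl')                      # hypothetical path; the repo uses ./datasets/predicted/
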
    def save_sequence_embeddings(self):
        'save sequence embeddings of model'
        df_list = ['assay_to_dot_training_data', 'seq_to_dot_test_data']
        OH_matrix = np.eye(len(self.assays))

        for df_name in df_list:
            df = load_format_data.load_df(df_name)
            x_a = self.get_input_seq(df)
            for z in range(3):  #for each model
                for i in range(1):  #only need to get cat var for one assay to get sequence embedding
                    cat_var = []
                    for j in x_a:  #for each sequence add cat_var
                        cat_var.append(OH_matrix[i].tolist())
                    x = load_format_data.mix_with_cat_var(x_a, cat_var)
                    self._model.set_model(
                        self.get_best_trial()['hyperparam'],
                        xa_len=len(x[0]) - len(cat_var[0]),
                        cat_var_len=len(cat_var[0]))  #need to build nn arch
                    self.load_model(
                        z)  #load pkled sklearn model or weights of nn model
                    seq_embedding_model = self._model.get_seq_embeding_layer_model()
                    df_prediction = seq_embedding_model.predict([x])
                    seq_emb_list = []
                    for i in df_prediction:
                        seq_emb_list.append([i])
                    df.loc[:, 'learned_embedding'] = seq_emb_list
                df.to_pickle('./datasets/predicted/learned_embedding_' +
                             df_name + '_' + self.model_name + '_' + str(z) +
                             '.pkl')
Example 7
 def save_predictions(self):
     'save assay score predictions of test dataset to be used with assay-to-yield model'
     df = load_format_data.load_df(
         'seq_to_dot_test_data'
     )  # will have to adjust if missing datapoints
     OH_matrix = np.eye(len(self.assays))
     x_a = self.get_input_seq(df)
     for z in range(3):  # for each model
         for i in range(len(self.assays)):  # for each assay
             cat_var = []
             for j in x_a:  # for each sequence add cat_var
                 cat_var.append(OH_matrix[i].tolist())
             x = load_format_data.mix_with_cat_var(x_a, cat_var)
             self._model.set_model(
                 self.get_best_trial()['hyperparam'],
                 xa_len=len(x[0]) - len(cat_var[0]),
                 cat_var_len=len(cat_var[0]),
                 lin_or_sig=self.lin_or_sig)  # need to build nn arch
             self.load_model(
                 z)  # load pkled sklearn model or weights of nn model
             df_prediction = self._model.model.predict(x).squeeze().tolist()
             df.loc[:, 'Sort' + str(self.assays[i]) +
                    '_mean_score'] = df_prediction
         df.to_pickle('./datasets/predicted/seq_to_dot_test_data_' +
                      self.model_name + '_' + str(z) + '.pkl')
 def __init__(self, model_in, assays, model_architecture, sample_fraction):
     assay_str = ','.join([str(x) for x in assays])
     super().__init__(model_in, 'assay' + assay_str, model_architecture,
                      sample_fraction)
     self.assays = assays
     self.get_output_and_explode = partial(load_format_data.explode_assays,
                                           assays)
     self.plot_type = plot_model.x_to_assay_plot
     self.training_df = load_format_data.load_df(
         'seq_to_assay_train_1,8,9,10'
     )  #could adjust in future for sequences with predictive assays
     self.testing_df = load_format_data.load_df(
         'assay_to_dot_training_data')
     self.num_cv_splits = 3
     self.num_cv_repeats = 3
     self.num_test_repeats = 3
     self.num_hyp_trials = 50
Example 9
 def __init__(self, seq_to_assay_model_prop, model_architecture,
              sample_fraction):
     self.assay_str = ','.join([str(x) for x in seq_to_assay_model_prop[0]])
     seq_to_assay_model_name = 'seq_assay' + self.assay_str + '_' + str(
         seq_to_assay_model_prop[1]) + '_' + str(
             seq_to_assay_model_prop[2]) + '_' + str(
                 seq_to_assay_model_prop[3])
     super().__init__('embedding_' + seq_to_assay_model_name,
                      model_architecture, sample_fraction)
     sequence_embedding_to_x_model.__init__(self)
     self.num_test_repeats = 1
     self.training_df = load_format_data.load_df(
         '/predicted/learned_embedding_assay_to_dot_training_data_' +
         seq_to_assay_model_name)
     self.testing_df = load_format_data.load_df(
         '/predicted/learned_embedding_seq_to_dot_test_data_' +
         seq_to_assay_model_name)
Example 10
 def apply_predicted_assay_scores(self, seq_to_assay_model_prop):
     'uses saved predicted assay scores and saved assay-to-yield model to determine performance on test-set'
     seq_to_assay_model_name = 'seq_assay' + self.assay_str + '_' + str(
         seq_to_assay_model_prop[0]) + '_' + str(
             seq_to_assay_model_prop[1]) + '_' + str(
                 seq_to_assay_model_prop[2])
     self.num_test_repeats = 1
     self.testing_df = load_format_data.load_df(
         'predicted/seq_to_dot_test_data_' + seq_to_assay_model_name)
     self.figure_file = './figures/' + self.model_name + '_' + seq_to_assay_model_name + '.png'
     self.stats_file = './model_stats/' + self.model_name + '_' + seq_to_assay_model_name + '.pkl'
     self.test_model()
Example 11
 def __init__(self, model_in, model_architecture, sample_fraction):
     ## Instantiating the object requires two strings and a float. These are then used, along with the 'yield' string,
     ## to create the class variables belonging to its parent class.
     super().__init__(model_in, 'yield', model_architecture,
                      sample_fraction)
     self.get_output_and_explode = load_format_data.explode_yield
     self.plot_type = plot_model.x_to_yield_plot
     ## A get_output_and_explode class attribute is created and linked to the explode_yield() function of the load_format_data.py script.
     ## Similarly, a plot_type class attribute is created and linked to the x_to_yield_plot class in the plot_model.py script.
     self.training_df = load_format_data.load_df(
         'assay_to_dot_training_data')
     self.testing_df = load_format_data.load_df('seq_to_dot_test_data')
     ## Training and testing data are loaded using the load_df() function from the load_format_data.py script,
     ## from the assay_to_dot_training_data and seq_to_dot_test_data files respectively.
     self.lin_or_sig = 'linear'
     self.num_cv_splits = 10
     self.num_cv_repeats = 10
     ## A linear (rather than sigmoid) output is selected via the lin_or_sig class variable, and the number of splits and repeats for cross-validation is
     ## specified in num_cv_splits and num_cv_repeats respectively (see the repeated k-fold sketch below).
     self.num_test_repeats = 10
     self.num_hyp_trials = 50
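
How num_cv_splits and num_cv_repeats translate into a cross-validation scheme can be sketched with scikit-learn's RepeatedKFold; this is an assumption about the mechanism, since the actual CV loop lives elsewhere in the repo:

    import numpy as np
    from sklearn.model_selection import RepeatedKFold

    num_cv_splits, num_cv_repeats = 10, 10
    X = np.arange(40).reshape(20, 2)  # toy feature matrix

    rkf = RepeatedKFold(n_splits=num_cv_splits, n_repeats=num_cv_repeats, random_state=0)
    print(rkf.get_n_splits(X))  # 100 train/validation partitions in total
    for train_idx, val_idx in rkf.split(X):
        pass  # fit on X[train_idx], evaluate on X[val_idx]
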
 def __init__(self, pred_yield_model_prop, seq_to_pred_yield_prop):
     super().__init__('seq', seq_to_pred_yield_prop[0],
                      seq_to_pred_yield_prop[1])
     seq_to_x_model.__init__(self, seq_to_pred_yield_prop[0])
     self.assay_str = ','.join([str(x) for x in pred_yield_model_prop[0]])
     pred_yield_model_name = 'assays' + self.assay_str + '_yield_' + pred_yield_model_prop[
         1] + '_' + str(pred_yield_model_prop[2]) + '_' + str(
             pred_yield_model_prop[3])
     self.update_model_name(self.model_name + ':' + pred_yield_model_name)
     self.training_df = load_format_data.load_df(
         'predicted/seq_to_assay_train_1,8,9,10_' + pred_yield_model_name)
     self.num_cv_splits = 3
     self.num_cv_repeats = 3
     self.num_test_repeats = 1
     self.num_hyp_trials = 50
Example 13
 def save_sequence_embeddings(self, df_list=None):
     'save sequence embeddings of model'
     ## For this function a list input is optional. If no input is given, df_list defaults to
     ## a list containing the names of the two default dataframes.
     if not df_list:
         df_list = ['assay_to_dot_training_data', 'seq_to_dot_test_data']
     OH_matrix = np.eye(len(self.assays))
     ## An identity matrix is created with the same size as the number of assays used to build the prediction model.
     for df_name in df_list:
         df = load_format_data.load_df(df_name)
         x_a = self.get_input_seq(df)
         ## For each name in df_list, the dataframe is loaded and the get_input_seq() function is run on it, with the result stored in x_a.
         ## Depending on which LIST_B class the child class (a LIST_A object) inherits from, get_input_seq is linked to a particular function and
         ## returns a particular dataframe.
         for z in range(3):  #for each model
             for i in range(1):  #only need to get cat var for one assay to get sequence embedding
                  ## Only the first assay's one-hot row is needed, because the sequence embedding does not depend on the categorical variable.
                  ## cat_var repeats that one-hot row once for each entry in x_a, and mix_with_cat_var() concatenates x_a with cat_var to form x.
                  ## set_model() rebuilds the network architecture with the best trial's hyperparameters, sized by the sequence and categorical lengths,
                  ## and load_model() loads the pickled sklearn model or the weights of the neural network.
                  ## get_seq_embeding_layer_model() returns a sub-model whose output is the sequence-embedding layer; its predictions
                  ## are stored in the dataframe under the learned_embedding column, and the dataframe is pickled in the datasets/predicted
                  ## directory under a name combining 'learned_embedding_', the dataframe name, and the model_name (see the Keras sketch after this example).
                 cat_var = []
                 for j in x_a:  #for each sequence add cat_var
                     cat_var.append(OH_matrix[i].tolist())
                 x = load_format_data.mix_with_cat_var(x_a, cat_var)
                 self._model.set_model(
                     self.get_best_trial()['hyperparam'],
                     xa_len=len(x[0]) - len(cat_var[0]),
                     cat_var_len=len(cat_var[0]),
                     lin_or_sig=self.lin_or_sig)  #need to build nn arch
                 self.load_model(
                     z)  #load pkled sklearn model or weights of nn model
                  seq_embedding_model = self._model.get_seq_embeding_layer_model()
                 df_prediction = seq_embedding_model.predict([x])
                 seq_emb_list = []
                 for i in df_prediction:
                     seq_emb_list.append([i])
                 df.loc[:, 'learned_embedding'] = seq_emb_list
             df.to_pickle('./datasets/predicted/learned_embedding_' +
                          df_name + '_' + self.model_name + '_' + str(z) +
                          '.pkl')
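
A minimal Keras sketch of what get_seq_embeding_layer_model() is assumed to return: a sub-model that stops at an intermediate embedding layer, so predict() yields per-sequence embeddings. Layer names and sizes here are hypothetical:

    import numpy as np
    import tensorflow as tf

    # toy network: mixed input -> embedding layer -> output head
    inputs = tf.keras.Input(shape=(12,))
    embedding = tf.keras.layers.Dense(4, name='seq_embedding')(inputs)  # hypothetical layer name
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(embedding)
    model = tf.keras.Model(inputs, outputs)

    # sub-model analogous to get_seq_embeding_layer_model()
    seq_embedding_model = tf.keras.Model(inputs=model.input,
                                         outputs=model.get_layer('seq_embedding').output)
    x = np.random.rand(3, 12).astype('float32')
    print(seq_embedding_model.predict(x).shape)  # (3, 4): one embedding per sequence
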
Example 14
 def apply_predicted_assay_scores(self, seq_to_assay_model_prop):
     ## This function takes an input called seq_to_assay_model_prop, which is an array or some other
     ## indexable object.
     'uses saved predicted assay scores and saved assay-to-yield model to determine performance on test-set'
     seq_to_assay_model_name = 'seq_assay' + self.assay_str + '_' + str(
         seq_to_assay_model_prop[0]) + '_' + str(
             seq_to_assay_model_prop[1]) + '_' + str(
                 seq_to_assay_model_prop[2])
     ## First a local string is created by combining 'seq_assay' with self.assay_str, a class variable present in the child classes
     ## listed in LIST_A, along with the first three elements of the seq_to_assay_model_prop list (see the worked example after this function).
     self.num_test_repeats = 1
     ## The class variable num_test_repeats is changed to 1.
     self.testing_df = load_format_data.load_df(
         'predicted/seq_to_dot_test_data_' + seq_to_assay_model_name)
     ## The testing_df dataframe is then updated to the corresponding predictions file in the predicted directory.
     self.figure_file = './figures/' + self.model_name + '_' + seq_to_assay_model_name + '.png'
     self.stats_file = './model_stats/' + self.model_name + '_' + seq_to_assay_model_name + '.pkl'
     ## The figure_file and stats_file strings inherited from the parent model class are updated so that
     ## the proper files in the figures and model_stats directories are used.
     self.test_model()
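
A worked example of the name construction above, with a hypothetical assay_str and property list:

    assay_str = '1,8,10'                           # hypothetical self.assay_str
    seq_to_assay_model_prop = ['emb_cnn', 1.0, 0]  # hypothetical property values
    seq_to_assay_model_name = ('seq_assay' + assay_str + '_' +
                               str(seq_to_assay_model_prop[0]) + '_' +
                               str(seq_to_assay_model_prop[1]) + '_' +
                               str(seq_to_assay_model_prop[2]))
    print(seq_to_assay_model_name)  # seq_assay1,8,10_emb_cnn_1.0_0
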
 def save_predictions(self):
     'saves model predictions for the large dataset'
     df = load_format_data.load_df(
         'seq_to_assay_train_1,8,9,10'
     )  #will have to adjust if missing datapoints
     OH_matrix = np.eye(2)
     matrix_col = ['IQ_Average_bc', 'SH_Average_bc']
     x_a = self.get_input_seq(df)
     for z in range(3):  #no of models
         self.load_model(z)
         for i in range(2):
             cat_var = []
             for j in x_a:
                 cat_var.append(OH_matrix[i].tolist())
             x = load_format_data.mix_with_cat_var(x_a, cat_var)
             df_prediction = self._model.model.predict(x).squeeze().tolist()
             col_name = matrix_col[i]
             df.loc[:, col_name] = df_prediction
         df.to_pickle('./datasets/predicted/seq_to_assay_train_1,8,9,10_' +
                      self.model_name + '_' + str(z) + '.pkl')
Example 16
 def save_predictions(self):
     'save assay score predictions of test dataset to be used with assay-to-yield model'
     ## This function requires no input; it saves the assay score predictions for the test dataset.
     df = load_format_data.load_df(
         'seq_to_dot_test_data')  #will have to adjust if missing datapoints
     ## Initially the seq_to_dot_test_data file is loaded and assigned to the df dataframe.
     OH_matrix = np.eye(len(self.assays))
     ## Then an identity matrix is created with the same size as the number of assays used to build the predictions.
     x_a = self.get_input_seq(df)
     ## Depending on which LIST_B class the child class (a LIST_A object) inherits from, get_input_seq is linked to a particular function and
     ## returns a particular dataframe.
     for z in range(3):  #for each model
         for i in range(len(self.assays)):  #for each assay
             ## For each assay in the assays list, a new list called cat_var is created, and the corresponding row of OH_matrix is appended
             ## to cat_var once for each entry in the x_a input.
             ## Then the input x is created using the mix_with_cat_var() function, and the regression model for the best trial is rebuilt
             ## with the hyperparameters from get_best_trial() and the lengths of x and cat_var (see the input-length sketch after this example).
             ## The load_model() function is then run, predictions are made using x, and they are saved in the original dataframe
             ## under the Sort<assay>_mean_score column. Finally the dataframe is saved as a pickle file in the datasets/predicted
             ## directory under a name combining 'seq_to_dot_test_data' with the model_name.
             cat_var = []
             for j in x_a:  #for each sequence add cat_var
                 cat_var.append(OH_matrix[i].tolist())
             x = load_format_data.mix_with_cat_var(x_a, cat_var)
             self._model.set_model(
                 self.get_best_trial()['hyperparam'],
                 xa_len=len(x[0]) - len(cat_var[0]),
                 cat_var_len=len(cat_var[0]),
                 lin_or_sig=self.lin_or_sig)  #need to build nn arch
             self.load_model(
                 z)  #load pkled sklearn model or weights of nn model
             df_prediction = self._model.model.predict(x).squeeze().tolist()
             df.loc[:, 'Sort' + str(self.assays[i]) +
                    '_mean_score'] = df_prediction
         df.to_pickle('./datasets/predicted/seq_to_dot_test_data_' +
                      self.model_name + '_' + str(z) + '.pkl')
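
A minimal sketch of how xa_len and cat_var_len are derived from the mixed input, assuming each mixed row is the sequence features followed by the one-hot assay flag:

    import numpy as np

    OH_matrix = np.eye(3)                          # toy: three assays
    seq_features = [0.1, 0.2, 0.3, 0.4]            # toy sequence representation
    x_row = seq_features + OH_matrix[0].tolist()   # a row as mix_with_cat_var would build it

    cat_var_len = len(OH_matrix[0])                # 3: width of the one-hot assay flag
    xa_len = len(x_row) - cat_var_len              # 4: width of the sequence features
    print(xa_len, cat_var_len)
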
Example 17
predicted_yield_per_model = []
for i in range(3):
    #load model
    e2y = mb.sequence_embeding_to_yield_model(s2a_params + [i], *e2y_params)

    #save predictions from learned embeddings in s2a model
    input_df_description = ('learned_embedding_' + df[0] + '_' +
                            s2a.model_name + '_' + str(i))

    # saved under input_df_description+embedding model properties, col='IQ_Average_bc','SH_Average_bc'
    e2y.save_predictions(input_df_description)

    #load predictions and add the two cell types yield together
    output_df_description = ('predicted/' + input_df_description + '_' +
                             e2y.model_name + '_' + str(0))
    predicted_df = load_format_data.load_df(output_df_description)
    predicted_iq_yield = predicted_df['IQ_Average_bc'].to_numpy()
    predicted_sh_yield = predicted_df['SH_Average_bc'].to_numpy()
    predicted_added_yield = np.sum([predicted_iq_yield, predicted_sh_yield],
                                   axis=0)
    predicted_yield_per_model.append(predicted_added_yield)

#average over trials
predicted_yield_avg = np.average(predicted_yield_per_model, axis=0)

#load original df, save final df with a Developability column (which we want to maximize)
df_original = load_format_data.load_df(df[0])
df_original['Developability'] = predicted_yield_avg.tolist()
df_original.to_pickle('./datasets/' + df[0] + '_with_predictions.pkl')

print("--- %s seconds ---" % (time.time() - start_time))