def seq_to_yield_simple(self):
    """Plot a bar chart comparing the simple sequence-to-yield models.

    Disables ``compare_test``, points ``get_control`` at
    ``get_assay_control``, builds one ``modelbank.seq_to_yield_model`` per
    architecture (second argument ``1``; presumably the full sample
    fraction -- confirm against modelbank) and passes the resulting list
    to ``plot_bar`` under the name ``'seq_to_yield_simple'``.
    """
    # Configure the instance before any model object is created, in the
    # same order as the original implementation.
    self.compare_test = False
    self.get_control = self.get_assay_control

    candidates = [
        modelbank.seq_to_yield_model(arch_name, 1)
        for arch_name in ('ridge', 'forest', 'svm', 'fnn')
    ]
    self.plot_bar(candidates, 'seq_to_yield_simple')
def get_best_seq_to_yield_simple(self):
    """Return the best of the simple sequence-to-yield models.

    Disables ``compare_test``, points ``get_control`` at
    ``get_assay_control``, builds one ``modelbank.seq_to_yield_model`` per
    architecture (ridge, forest, svm, fnn) with second argument ``1``, and
    returns whatever ``self.get_best_model`` selects from that list.
    """
    self.compare_test = False
    self.get_control = self.get_assay_control

    architectures = ('ridge', 'forest', 'svm', 'fnn')
    candidates = [
        modelbank.seq_to_yield_model(name, 1) for name in architectures
    ]
    return self.get_best_model(candidates)
def get_best_seq_to_yield_simple(self):
    ## Turn off the compare_test flag and make get_control refer to this
    ## object's get_assay_control method.
    self.compare_test = False
    self.get_control = self.get_assay_control

    ## Build one seq_to_yield_model (from the modelbank module) for each simple
    ## regression architecture, each with a sample fraction of 1.
    simple_archs = ('ridge', 'forest', 'svm', 'fnn')
    built_models = [
        modelbank.seq_to_yield_model(arch_name, 1)
        for arch_name in simple_archs
    ]

    ## Hand the candidates to get_best_model() and return its result unchanged.
    return self.get_best_model(built_models)
def main():
    '''
    compare test performances when reducing training sample size.
    This version is for first paper, predicting yield from assays and
    one-hot encoded sequence.

    The single command-line argument is an integer toggle in 0..12:
        0-3   seq + assay -> yield, architecture index = toggle
        4-7   assay -> yield,       architecture index = toggle - 4
        8-11  seq -> yield,         architecture index = toggle - 8
        12    control -> yield,     architecture index = 0

    Any other toggle aborts with 'incorrect toggle number' before a model is
    built. Previously the message was printed and the run then crashed on an
    unbound ``b`` (or, for negative values, silently wrapped around
    ``arch_list``).
    '''
    toggle = int(sys.argv[1])
    if not 0 <= toggle <= 12:
        # Fail fast with a non-zero exit status; the message goes to stderr.
        sys.exit('incorrect toggle number')
    # b selects the model family, a the architecture within arch_list.
    b, a = divmod(toggle, 4)

    arch_list = ['ridge', 'svm', 'forest', 'fnn']
    # Full sweep, kept for reference:
    # size_list=[0.055,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
    size_list = [0.7, 0.8, 0.9, 1]
    for size in size_list:
        if b == 0:
            mdl = modelbank.seqandassay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 1:  # 1,5,9,12
            mdl = modelbank.assay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 2:
            mdl = modelbank.seq_to_yield_model(arch_list[a], size)
        else:  # b == 3, guaranteed by the range check above
            mdl = modelbank.control_to_yield_model(arch_list[a], size)

        # Repeat the evaluation for each sampling seed at this size.
        for seed in range(9):  # no seed is seed=42
            mdl.change_sample_seed(seed)
            mdl.cross_validate_model()
            mdl.limit_test_set([1, 8, 10])
            mdl.test_model()
mdl = modelbank.assay_to_yield_model([1, 8, 10], arch, 1) elif i == 1: mdl = modelbank.weighted_assay_to_yield_model([1, 8, 10], arch, 1) elif i == 2: mdl = modelbank.seqandassay_to_yield_model([1, 8, 10], arch, 1) else: mdl = modelbank.seqandweightedassay_to_yield_model([1, 8, 10], arch, 1) if mdl.model_stats['cv_avg_loss'] < cv_loss: cv_loss = mdl.model_stats['cv_avg_loss'] test_loss = mdl.model_stats['test_avg_loss'] test_std = mdl.model_stats['test_std_loss'] loss_per_model.append(test_loss) std_per_model.append(test_std) seq_model = modelbank.seq_to_yield_model('forest', 1) seq_loss = seq_model.model_stats['test_avg_loss'] seq_std = seq_model.model_stats['test_std_loss'] x = [-0.3, 0.8] seq_plus = [seq_loss + seq_std] * 2 seq_min = [seq_loss - seq_std] * 2 control_model = modelbank.control_to_yield_model('ridge', 1) control_loss = control_model.model_stats['test_avg_loss'] control_model.limit_test_set([1, 8, 10]) exploded_df, _, _ = load_format_data.explode_yield(control_model.testing_df) exp_var = np.average(np.square(np.array(exploded_df['y_std']))) fig, ax = plt.subplots(1, 1, figsize=[2, 2], dpi=300) xloc = [0, 0.5]
def main():
    '''
    compare test performances when reducing training sample size.
    This version is for first paper, predicting yield from assays and
    one-hot encoded sequence.
    '''
    ## A command line input is required when running this program. The integer
    ## input must be between 0 and 12 inclusive.
    toggle = int(sys.argv[1])
    if not 0 <= toggle <= 12:
        ## Out-of-range input: stop here with the error message. Previously the
        ## message was printed and the run then crashed on an unbound b (or,
        ## for negative input, silently wrapped around arch_list).
        sys.exit('incorrect toggle number')
    ## b is the model family and a the index into arch_list:
    ##   0-3  -> b=0, a=input        4-7 -> b=1, a=input-4
    ##   8-11 -> b=2, a=input-8      12  -> b=3, a=0
    b, a = divmod(toggle, 4)

    ## Names of the regression architectures that a indexes into.
    arch_list = ['ridge', 'svm', 'forest', 'fnn']
    ## Sample fractions to evaluate (full sweep kept for reference).
    # size_list=[0.055,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
    size_list = [0.7, 0.8, 0.9, 1]
    for size in size_list:
        ## The value of b dictates which modelbank object is created; each one
        ## uses the architecture arch_list[a] and the current sample fraction.
        if b == 0:
            ## seq + assay -> yield, with the assay list [1,8,10]
            mdl = modelbank.seqandassay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 1:  # 1,5,9,12
            ## assay -> yield, with the assay list [1,8,10]
            mdl = modelbank.assay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 2:
            ## seq -> yield
            mdl = modelbank.seq_to_yield_model(arch_list[a], size)
        else:
            ## b == 3 (guaranteed by the range check above): control -> yield
            mdl = modelbank.control_to_yield_model(arch_list[a], size)

        for seed in range(9):  # no seed is seed=42
            ## For each seed in [0,9): set the model's sample seed, ...
            mdl.change_sample_seed(seed)
            ## ... determine the best hyperparameters by cross-validation, ...
            mdl.cross_validate_model()
            ## ... limit the test set to assays 1, 8 and 10, ...
            mdl.limit_test_set([1, 8, 10])
            ## ... and finally train with those hyperparameters and evaluate
            ## on the testing data.
            mdl.test_model()
for arch in arch_list: ## Then for each element in the arch_list ## Depending on which outermost iterative loop(i.e what value of b) we are in the model object we are going to create ## from the submodels_module.py program ## if b = 0, then a seqandassay_to_yield_model object is created with an assay list of [1,8,10] ## a regression model a sample_fraction determined by the iteration of arch_llist and size_list respectively. if b==0: mdl=modelbank.seqandassay_to_yield_model([1,8,10],arch,size) ## if b = 1, then a assay_to_yield_model object is created with an assay list of [1,8,10] ## a regression model a sample_fraction determined by the iteration of arch_llist and size_list respectively. elif b==1: #1,5,9,12 mdl=modelbank.assay_to_yield_model([1,8,10],arch,size) ## if b = 2, then a seq_to_yield_model object is created with a regression model a sample_fraction ## determined by the iteration of arch_llist and size_list respectively. elif b==2: mdl=modelbank.seq_to_yield_model(arch,size) ## if b = 2, then a control_to_yield_model object is created with a ridge regression model and a sample_fraction ## determined by the iteration of size_list. elif b==3: mdl=modelbank.control_to_yield_model('ridge',size) cur_cv_loss=[] cur_test_loss=[] cur_cv_loss.append(mdl.model_stats['cv_avg_loss']) cur_test_loss.append(mdl.model_stats['test_avg_loss']) ## Once the model object is created and stored on mdl, two new lists cur_cv_loss and cur_test_loss are created and the ## cv_avg_loss and test_avg_loss columns in the model_stats class dataframe are accessed and stored respectively. for seed in range(9): ## For each element in the int range of [0,9), the change_sample_seed() function of the x_to_yield_model parent class ## defined in the submodels_module.py program is run which changes the sample_seed class int to reflect the element and updates the ## trial data, model data and plots to reflect this change.