import pickle

import luigi
import numpy as np
import pandas as pd
from sklearn import (ensemble, linear_model, model_selection,
                     neural_network, preprocessing)

# Assumed to be provided elsewhere in this repo (not shown in this excerpt):
# the `task` helpers (Requires, Requirement, SaltedOutput), the
# featurize_guides helper, and the upstream data tasks RS2CombData, RS3Train,
# AchillesTestData, OofGv2, OofFc, OofRes, and FeaturizeAchillesTest.
import task


class BaseFeaturize(luigi.Task):
    __version__ = '0.8'
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()
    features = luigi.DictParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()

    requires = task.Requires()

    output = task.SaltedOutput(base_dir='data/featurized', ext='.csv')

    def run(self):
        reqs = self.requires()
        interim_target = reqs['filtered'].output()
        with interim_target.open('r') as interim_file:
            interim_mat = pd.read_csv(interim_file)
        kmers = interim_mat[self.kmer_column]
        featurized_kmers = featurize_guides(
            kmers, self.features, self.pam_start, self.pam_length,
            self.guide_start, self.guide_length,
            oof_mutation_rates=interim_mat['OOF mutation rate'])
        featurized_kmers['activity'] = interim_mat[self.activity_column]
        featurized_kmers['kmer'] = interim_mat[self.kmer_column]
        with self.output().open('w') as f:
            featurized_kmers.to_csv(f, index=False)
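# FeaturizeTrain is required by several tasks below but is not defined in
# this excerpt. The stand-in below is a minimal, hypothetical sketch of how
# such a subclass of BaseFeaturize might look; the real task may pin
# different parameters or a different upstream task.
class FeaturizeTrain(BaseFeaturize):
    __version__ = '0.1'  # hypothetical version
    requires = task.Requires()
    # BaseFeaturize.run() reads reqs['filtered'] and expects an
    # 'OOF mutation rate' column, which FilteredRS2Data (below) provides.
    filtered = task.Requirement(FilteredRS2Data)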
class BestModel(luigi.Task):
    __version__ = '0.3'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()

    requires = task.Requires()
    example = True
    if example:
        cv_lasso = task.Requirement(
            CrossValidate, model_str='lasso',
            param_grid={'alpha': np.logspace(-3, 0, 100).tolist()})
    else:
        cv_gb = task.Requirement(
            CrossValidate, model_str='GB',
            param_grid={
                'max_depth': [int(x) for x in np.linspace(2, 40, 30)],
                'max_features': np.linspace(0.01, 0.3, 50).tolist(),
                'min_samples_split': np.linspace(0.01, 0.4, 50).tolist(),
                'subsample': np.linspace(0.6, 1, 50).tolist(),
                'alpha': np.linspace(0.5, 0.99, 50).tolist()})
    # cv_nn = task.Requirement(
    #     CrossValidate, model_str='NN',
    #     param_grid={'alpha': np.logspace(-4, -0.01, 100).tolist(),
    #                 'learning_rate_init': np.linspace(0.001, 0.3, 50).tolist()})
    # cv_gb = task.Requirement(CrossValidate, model_str='GB',
    #                          param_grid={'alpha': [0.5]})

    output = task.SaltedOutput(base_dir='data/models', ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        reqs = self.requires()
        best_fit = None
        for model, cv_x in reqs.items():
            with cv_x.output().open('rb') as f:
                cv_model = pickle.load(f)
            # Scores are negative MSE, so larger means a better fit.
            score = cv_model.best_score_
            curr_estimator = cv_model.best_estimator_
            if best_fit is None or best_fit < score:
                best_estimator = curr_estimator
                best_fit = score
        with self.output().open('wb') as f:
            pickle.dump(best_estimator, f)
class CrossValidate(luigi.Task):
    __version__ = '0.5'
    model_str = luigi.Parameter()
    folds = luigi.IntParameter(default=10)
    param_grid = luigi.DictParameter()

    requires = task.Requires()
    scaler = task.Requirement(Standardize, activity_column='percentile',
                              kmer_column='X30mer')
    featurized = task.Requirement(FeaturizeTrain)

    output = task.SaltedOutput(base_dir='data/cv', ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        reqs = self.requires()
        featurized = reqs['featurized']
        with featurized.output().open('r') as f:
            featurized_df = pd.read_csv(f)
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        y = featurized_df['activity']
        X = featurized_df[featurized_df.columns.difference(['activity', 'kmer'])]
        X_train = scaler.transform(X)
        # X_train = X
        if self.model_str == 'GB':
            model = ensemble.GradientBoostingRegressor()
        elif self.model_str == 'RF':
            model = ensemble.RandomForestRegressor()
        elif self.model_str == 'lasso':
            model = linear_model.Lasso()
        elif self.model_str == 'EN':
            model = linear_model.ElasticNet()
        elif self.model_str == 'NN':
            model = neural_network.MLPRegressor()
        grid_search = model_selection.RandomizedSearchCV(
            model, dict(self.param_grid), cv=self.folds,
            scoring='neg_mean_squared_error', n_iter=20, n_jobs=1)
        grid_search.fit(X_train, y)
        # Use path because we have to write binary (stack: localTarget pickle)
        with self.output().open('wb') as f:
            pickle.dump(grid_search, f)
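# Illustrative invocation of the cross-validation task (hypothetical; assumes
# the remaining upstream parameters, e.g. `features` and the guide/PAM
# coordinates for Standardize, are supplied via luigi configuration):
#
#     luigi.build(
#         [CrossValidate(model_str='lasso',
#                        param_grid={'alpha': np.logspace(-3, 0, 100).tolist()})],
#         local_scheduler=True)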
class AnalyzePredictions(luigi.Task):
    __version__ = '0.1'
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()
    # FIXME: this Requirement's target task is left unspecified in the source.
    azimuth_predictions = task.Requirement()
    rs2_predictions = task.Requirement(
        PredictModel,
        features={'Pos. Ind. 1mer': True, 'Pos. Ind. 2mer': True,
                  'Pos. Ind. 3mer': False, 'Pos. Dep. 1mer': True,
                  'Pos. Dep. 2mer': True, 'Pos. Dep. 3mer': False,
                  'GC content': True, 'Tm': True})
    dimer_predictions = task.Requirement(
        PredictModel,
        features={'Pos. Ind. 1mer': False, 'Pos. Ind. 2mer': False,
                  'Pos. Ind. 3mer': False, 'Pos. Dep. 1mer': False,
                  'Pos. Dep. 2mer': True, 'Pos. Dep. 3mer': False,
                  'GC content': True, 'Tm': False})

    output = task.SaltedOutput(base_dir='data/predictions', ext='.csv')

    def run(self):
        # FIXME: this body references requirement keys ('model', 'test_mat',
        # 'scaler') that this task never declares (it mirrors
        # PredictModel.run()), so it will raise a KeyError as written.
        reqs = self.requires()
        with reqs['model'].output().open('rb') as f:
            model = pickle.load(f)
        with reqs['test_mat'].output().open('r') as f:
            test_mat = pd.read_csv(f)
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        y = test_mat['activity']
        X = test_mat[test_mat.columns.difference(['activity', 'kmer'])]
        X_train = scaler.transform(X)
        predictions = model.predict(X_train)
        prediction_mat = pd.DataFrame({'kmer': test_mat['kmer'],
                                       'true': y,
                                       'predicted': predictions})
        with self.output().open('w') as f:
            prediction_mat.to_csv(f, index=False)
class Fasta(luigi.Task):
    __version__ = '0.1'
    seq_col = luigi.Parameter()

    requires = task.Requires()
    seq_data = task.Requirement(RS2CombData)

    output = task.SaltedOutput(base_dir='./data/raw', ext='.FASTA')

    def run(self):
        reqs = self.requires()
        with reqs['seq_data'].output().open('r') as f:
            seq_data = pd.read_csv(f)
        seqs = seq_data[self.seq_col]
        with self.output().open('w') as f:
            for seq in seqs:
                f.write('>' + seq + '\n')
                f.write(seq + '\n')
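# The FASTA records written above use each sequence as its own header line,
# e.g. for one 30mer row:
#
#     >ACGTACGTACGTACGTACGTACGTACGTAA
#     ACGTACGTACGTACGTACGTACGTACGTAA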
class PredictModel(luigi.Task):
    __version__ = '0.2'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    true_val = luigi.BoolParameter(default=True)

    requires = task.Requires()
    model = task.Requirement(BestModel, activity_column='percentile',
                             kmer_column='X30mer')
    test_mat = task.Requirement(FeaturizeAchillesTest,
                                activity_column='sgRNA.measured.value',
                                kmer_column='X30mer')
    scaler = task.Requirement(Standardize, activity_column='percentile',
                              kmer_column='X30mer')

    output = task.SaltedOutput(base_dir='data/predictions', ext='.csv')

    def run(self):
        reqs = self.requires()
        with reqs['model'].output().open('rb') as f:
            model = pickle.load(f)
        with reqs['test_mat'].output().open('r') as f:
            test_mat = pd.read_csv(f)
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        y = test_mat['activity']
        X = test_mat[test_mat.columns.difference(['activity', 'kmer'])]
        X_train = scaler.transform(X)
        # X_train = X
        predictions = model.predict(X_train)
        prediction_mat = pd.DataFrame({'kmer': test_mat['kmer'],
                                       'true': y,
                                       'predicted': predictions})
        with self.output().open('w') as f:
            prediction_mat.to_csv(f, index=False)
class ModelCoefficients(luigi.Task):
    __version__ = '0.3'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()
    model = task.Requirement(BestModel, activity_column='percentile',
                             kmer_column='X30mer')
    scaler = task.Requirement(Standardize, activity_column='percentile',
                              kmer_column='X30mer')
    train_mat = task.Requirement(FeaturizeTrain, activity_column='percentile',
                                 kmer_column='X30mer')

    output = task.SaltedOutput(base_dir='data/models', ext='.csv')

    def run(self):
        reqs = self.requires()
        with reqs['model'].output().open('rb') as f:
            model = pickle.load(f)
        with reqs['train_mat'].output().open('r') as f:
            train_mat = pd.read_csv(f)
        # Pickles must be read in binary mode ('rb', not 'r').
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        X = train_mat[train_mat.columns.difference(['activity', 'kmer'])]
        if model.__class__ == ensemble.GradientBoostingRegressor:
            importances = model.feature_importances_
        elif model.__class__ == linear_model.Lasso:
            importances = score_coefs(scaler.transform(X),
                                      train_mat['activity'],
                                      model.coef_, model.intercept_)
        feature_importances = pd.DataFrame({'feature': X.columns,
                                            'importance': importances})
        with self.output().open('w') as f:
            feature_importances.to_csv(f, index=False)
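# score_coefs is called above but not defined in this excerpt. The sketch
# below is one plausible implementation, assuming the intent is to score
# each lasso coefficient by how much of the target its single-feature term
# explains on its own; the real helper may compute something different.
def score_coefs(X_scaled, y, coefs, intercept):
    """Hypothetical: per-feature importance as the squared correlation
    between each feature's partial prediction and the target."""
    importances = []
    for j, coef in enumerate(coefs):
        partial = X_scaled[:, j] * coef + intercept
        if np.std(partial) == 0:  # zeroed-out coefficient: no contribution
            importances.append(0.0)
        else:
            importances.append(np.corrcoef(partial, y)[0, 1] ** 2)
    return importances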
class FilteredAchillesData(luigi.Task):
    __version__ = '0.1'

    requires = task.Requires()
    achilles_file = task.Requirement(AchillesTestData)
    oof_gv2_file = task.Requirement(OofGv2)

    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        with self.achilles_file.output().open('r') as f:
            achilles_data = pd.read_csv(f)
        with self.oof_gv2_file.output().open('r') as f:
            oof_gv2_data = pd.read_csv(f)
        achilles_oof_data = (pd.merge(achilles_data,
                                      oof_gv2_data.drop_duplicates(),
                                      how='inner', on='X30mer')
                             .drop(['X', 'Unnamed: 0'], axis=1)
                             .drop_duplicates())
        with self.output().open('w') as f:
            achilles_oof_data.to_csv(f)
class FilteredRS3Data(luigi.Task):
    __version__ = '0.3'

    requires = task.Requires()
    rs3_file = task.Requirement(RS3Train)
    assays = luigi.ListParameter()
    assays_end = luigi.ListParameter()
    assays_start = luigi.ListParameter()
    perc_pep_end = luigi.IntParameter()
    perc_pep_start = luigi.IntParameter()

    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        with self.rs3_file.output().open('r') as f:
            rs3_data = pd.read_csv(f)
        # Keep only the assays of interest; within the designated assay
        # subsets, additionally require the target cut site to fall before
        # perc_pep_end / after perc_pep_start (other assays pass through).
        in_assays = rs3_data.Assay_ID.isin(self.assays)
        end_ok = (~rs3_data.Assay_ID.isin(self.assays_end) |
                  (rs3_data.Target_Cut < self.perc_pep_end))
        start_ok = (~rs3_data.Assay_ID.isin(self.assays_start) |
                    (rs3_data.Target_Cut > self.perc_pep_start))
        filtered_rs3_data = rs3_data[in_assays & end_ok & start_ok]
        with self.output().open('w') as f:
            filtered_rs3_data.to_csv(f)
class FilteredRS2Data(luigi.Task):
    __version__ = '0.1'

    requires = task.Requires()
    rs2_file = task.Requirement(RS2CombData)
    oof_fc_file = task.Requirement(OofFc)
    oof_res_file = task.Requirement(OofRes)

    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        with self.rs2_file.output().open('r') as f:
            rs2_data = pd.read_csv(f)
        with self.oof_fc_file.output().open('r') as f:
            oof_fc_data = pd.read_csv(f)
        with self.oof_res_file.output().open('r') as f:
            oof_res_data = pd.read_csv(f)
        rs2_oof_data = (pd.merge(oof_res_data[['30mer', 'OOF mutation rate']],
                                 oof_fc_data[['30mer', 'OOF mutation rate']],
                                 how='outer')
                        .merge(rs2_data, how='inner', on='30mer')
                        .drop_duplicates())
        with self.output().open('w') as f:
            rs2_oof_data.to_csv(f)
class Standardize(luigi.Task):
    __version__ = '0.1'
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()
    featurized = task.Requirement(FeaturizeTrain)

    # The output is a pickled StandardScaler, not a CSV, so use a '.pickle'
    # extension alongside the no-op format required for binary writes.
    output = task.SaltedOutput(base_dir='data/featurized', ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        reqs = self.requires()
        with reqs['featurized'].output().open('r') as f:
            test_mat = pd.read_csv(f)
        X = test_mat[test_mat.columns.difference(['activity', 'kmer'])]
        scaler = preprocessing.StandardScaler().fit(X)
        with self.output().open('wb') as f:
            pickle.dump(scaler, f)
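# Illustrative use of the pickled scaler outside of Luigi (the filename is
# hypothetical; SaltedOutput salts the real one):
#
#     with open('data/featurized/Standardize__<salt>.pickle', 'rb') as f:
#         scaler = pickle.load(f)
#     X_scaled = scaler.transform(X_new)  # X_new: same feature columns as training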