# Example 1
class BaseFeaturize(luigi.Task):
    """Featurize guide kmers produced by an upstream filtering task.

    Subclasses are expected to contribute a ``filtered`` requirement
    (TODO confirm -- none is declared here); this base class holds the
    shared parameters and the featurization logic.
    """
    __version__ = '0.8'
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()
    features = luigi.DictParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()

    requires = task.Requires()
    output = task.SaltedOutput(base_dir='data/featurized', ext='.csv')

    def run(self):
        # Load the filtered interim matrix written by the upstream task.
        filtered_task = self.requires()['filtered']
        with filtered_task.output().open('r') as interim_file:
            interim = pd.read_csv(interim_file)
        # Build the feature matrix for every kmer, then carry the activity
        # label and the raw kmer along for downstream tasks.
        featurized = featurize_guides(
            interim[self.kmer_column],
            self.features,
            self.pam_start,
            self.pam_length,
            self.guide_start,
            self.guide_length,
            oof_mutation_rates=interim['OOF mutation rate'])
        featurized['activity'] = interim[self.activity_column]
        featurized['kmer'] = interim[self.kmer_column]
        with self.output().open('w') as out_file:
            featurized.to_csv(out_file, index=False)
class BestModel(luigi.Task):
    """Select the estimator with the best CV score across all CV runs."""
    __version__ = '0.3'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()

    requires = task.Requires()

    # Toggle preserved from the original: only one candidate family is
    # registered at a time.  With example=True only the lasso search runs.
    example = True
    if example:
        cv_lasso = task.Requirement(
            CrossValidate,
            model_str='lasso',
            param_grid={'alpha': np.logspace(-3, 0, 100).tolist()})
    else:
        cv_gb = task.Requirement(CrossValidate,
                                 model_str='GB',
                                 param_grid={
                                     'max_depth':
                                     [int(x) for x in np.linspace(2, 40, 30)],
                                     'max_features':
                                     np.linspace(0.01, 0.3, 50).tolist(),
                                     'min_samples_split':
                                     np.linspace(0.01, 0.4, 50).tolist(),
                                     'subsample':
                                     np.linspace(0.6, 1, 50).tolist(),
                                     'alpha':
                                     np.linspace(0.5, 0.99, 50).tolist()
                                 })

    output = task.SaltedOutput(base_dir='data/models',
                               ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        """Load every CV result and pickle the estimator with the top score."""
        reqs = self.requires()
        best_estimator = None
        best_score = None
        for _, cv_task in reqs.items():
            with cv_task.output().open('rb') as f:
                cv_model = pickle.load(f)
            # The original "first model" and "better model" branches were
            # identical; collapse them into one comparison.
            score = cv_model.best_score_
            if best_score is None or score > best_score:
                best_score = score
                best_estimator = cv_model.best_estimator_
        if best_estimator is None:
            # Previously this fell through to a NameError on best_estimator.
            raise RuntimeError('no cross-validation results to choose from')

        with self.output().open('wb') as f:
            pickle.dump(best_estimator, f)
class CrossValidate(luigi.Task):
    """Randomized hyper-parameter search for one model family.

    ``model_str`` selects the estimator: 'GB', 'RF', 'lasso', 'EN' or 'NN'.
    The fitted RandomizedSearchCV object is pickled to the salted output.
    """
    __version__ = '0.5'
    model_str = luigi.Parameter()
    folds = luigi.IntParameter(default=10)
    param_grid = luigi.DictParameter()
    requires = task.Requires()
    scaler = task.Requirement(Standardize,
                              activity_column='percentile',
                              kmer_column='X30mer')

    featurized = task.Requirement(FeaturizeTrain)

    output = task.SaltedOutput(base_dir='data/cv',
                               ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        reqs = self.requires()
        with reqs['featurized'].output().open('r') as f:
            featurized_df = pd.read_csv(f)
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        y = featurized_df['activity']
        # Everything except the label and identifier columns is a feature.
        X = featurized_df[featurized_df.columns.difference(
            ['activity', 'kmer'])]
        X_train = scaler.transform(X)
        # Dispatch table instead of an if/elif chain; an unrecognized
        # model_str previously fell through and raised NameError below.
        model_factories = {
            'GB': ensemble.GradientBoostingRegressor,
            'RF': ensemble.RandomForestRegressor,
            'lasso': linear_model.Lasso,
            'EN': linear_model.ElasticNet,
            'NN': neural_network.MLPRegressor,
        }
        try:
            model = model_factories[self.model_str]()
        except KeyError:
            raise ValueError('Unknown model_str: {!r}'.format(self.model_str))
        grid_search = model_selection.RandomizedSearchCV(
            model,
            dict(self.param_grid),
            cv=self.folds,
            scoring='neg_mean_squared_error',
            n_iter=20,
            n_jobs=1)
        grid_search.fit(X_train, y)
        # Binary mode because pickle writes bytes (output uses format=Nop).
        with self.output().open('wb') as f:
            pickle.dump(grid_search, f)
# Example 4
class AnalyzePredictions(luigi.Task):
    # NOTE(review): the run() body below reads reqs['model'], reqs['test_mat']
    # and reqs['scaler'], but the requirements declared on this class are
    # azimuth_predictions / rs2_predictions / dimer_predictions -- the body
    # looks copy-pasted from PredictModel.run().  Intended behavior unclear;
    # confirm before relying on this task.
    __version__ = '0.1'
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()
    # NOTE(review): Requirement() is called with no task class here, unlike
    # every other Requirement in this file -- presumably incomplete; verify.
    azimuth_predictions = task.Requirement()
    # RS2-style feature set: 1/2-mer position independent and dependent
    # features plus GC content and Tm.
    rs2_predictions = task.Requirement(PredictModel,
                             features = {'Pos. Ind. 1mer': True,
                                          'Pos. Ind. 2mer': True,
                                          'Pos. Ind. 3mer': False,
                                          'Pos. Dep. 1mer': True,
                                          'Pos. Dep. 2mer': True,
                                          'Pos. Dep. 3mer': False,
                                          'GC content': True,
                                          'Tm': True})
    # Reduced feature set: only position-dependent 2mers and GC content.
    dimer_predictions = task.Requirement(PredictModel,
                             features = {'Pos. Ind. 1mer': False,
                                          'Pos. Ind. 2mer': False,
                                          'Pos. Ind. 3mer': False,
                                          'Pos. Dep. 1mer': False,
                                          'Pos. Dep. 2mer': True,
                                          'Pos. Dep. 3mer': False,
                                          'GC content': True,
                                          'Tm': False})



    output = task.SaltedOutput(base_dir='data/predictions', ext='.csv')

    def run(self):
        # Load the pickled model, the featurized test matrix and the fitted
        # scaler, then write true vs. predicted activity per kmer.
        reqs = self.requires()
        with reqs['model'].output().open('rb') as f:
            model = pickle.load(f)
        with reqs['test_mat'].output().open('r') as f:
            test_mat = pd.read_csv(f)
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        y = test_mat['activity']
        # Feature columns are everything except the label and the kmer id.
        X = test_mat[test_mat.columns.difference(['activity', 'kmer'])]
        X_train = scaler.transform(X)
        predictions = model.predict(X_train)
        prediction_mat = pd.DataFrame({'kmer': test_mat['kmer'], 'true': y, 'predicted': predictions})
        with self.output().open('w') as f:
            prediction_mat.to_csv(f, index=False)
class Fasta(luigi.Task):
    """Write every sequence from the raw RS2 data as a FASTA record."""
    __version__ = '0.1'
    seq_col = luigi.Parameter()
    requires = task.Requires()
    seq_data = task.Requirement(RS2CombData)

    output = task.SaltedOutput(base_dir='./data/raw', ext='.FASTA')

    def run(self):
        with self.requires()['seq_data'].output().open('r') as raw_file:
            sequences = pd.read_csv(raw_file)[self.seq_col]
        with self.output().open('w') as fasta_file:
            for sequence in sequences:
                # Each sequence doubles as its own FASTA header line.
                fasta_file.write('>' + sequence + '\n')
                fasta_file.write(sequence + '\n')
class PredictModel(luigi.Task):
    """Score the best model on the featurized test set.

    Writes a CSV with one row per kmer holding the measured ('true') and
    predicted activity.
    """
    __version__ = '0.2'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    true_val = luigi.BoolParameter(default=True)

    requires = task.Requires()
    model = task.Requirement(BestModel,
                             activity_column='percentile',
                             kmer_column='X30mer')
    test_mat = task.Requirement(FeaturizeAchillesTest,
                                activity_column='sgRNA.measured.value',
                                kmer_column='X30mer')
    scaler = task.Requirement(Standardize,
                              activity_column='percentile',
                              kmer_column='X30mer')

    output = task.SaltedOutput(base_dir='data/predictions', ext='.csv')

    def run(self):
        reqs = self.requires()
        with reqs['model'].output().open('rb') as handle:
            model = pickle.load(handle)
        with reqs['test_mat'].output().open('r') as handle:
            test_mat = pd.read_csv(handle)
        with reqs['scaler'].output().open('rb') as handle:
            scaler = pickle.load(handle)
        truth = test_mat['activity']
        # Feature columns are everything except the label and the kmer id.
        feature_cols = test_mat.columns.difference(['activity', 'kmer'])
        scaled_features = scaler.transform(test_mat[feature_cols])
        predicted = model.predict(scaled_features)
        results = pd.DataFrame({
            'kmer': test_mat['kmer'],
            'true': truth,
            'predicted': predicted
        })
        with self.output().open('w') as handle:
            results.to_csv(handle, index=False)
class ModelCoefficients(luigi.Task):
    """Export feature importances/coefficients of the best trained model."""
    __version__ = '0.3'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()

    model = task.Requirement(BestModel,
                             activity_column='percentile',
                             kmer_column='X30mer')
    scaler = task.Requirement(Standardize,
                              activity_column='percentile',
                              kmer_column='X30mer')
    train_mat = task.Requirement(FeaturizeTrain,
                                 activity_column='percentile',
                                 kmer_column='X30mer')
    output = task.SaltedOutput(base_dir='data/models', ext='.csv')

    def run(self):
        reqs = self.requires()
        with reqs['model'].output().open('rb') as f:
            model = pickle.load(f)
        with reqs['train_mat'].output().open('r') as f:
            train_mat = pd.read_csv(f)
        # BUG FIX: pickles must be opened in binary mode ('rb', not 'r'),
        # matching every other scaler read in this file.
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        # Feature columns are everything except the label and the kmer id.
        X = train_mat[train_mat.columns.difference(['activity', 'kmer'])]
        if isinstance(model, ensemble.GradientBoostingRegressor):
            importances = model.feature_importances_
        elif isinstance(model, linear_model.Lasso):
            importances = score_coefs(scaler.transform(X),
                                      train_mat['activity'], model.coef_,
                                      model.intercept_)
        else:
            # Previously `importances` was silently left undefined here,
            # producing a NameError below.
            raise TypeError('unsupported model type: %r' % type(model))
        feature_importances = pd.DataFrame({
            'feature': X.keys(),
            'importance': importances
        })
        with self.output().open('w') as f:
            feature_importances.to_csv(f, index=False)
class FilteredAchillesData(luigi.Task):
    """Join Achilles test data with Gv2 out-of-frame scores on X30mer."""
    __version__ = '0.1'
    requires = task.Requires()
    achilles_file = task.Requirement(AchillesTestData)

    oof_gv2_file = task.Requirement(OofGv2)

    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        with self.achilles_file.output().open('r') as handle:
            achilles = pd.read_csv(handle)
        with self.oof_gv2_file.output().open('r') as handle:
            oof_gv2 = pd.read_csv(handle)
        # Inner join on the 30mer, then drop leftover index columns and any
        # duplicated rows introduced by the merge.
        merged = pd.merge(achilles,
                          oof_gv2.drop_duplicates(),
                          how='inner',
                          on='X30mer')
        cleaned = merged.drop(['X', 'Unnamed: 0'], axis=1).drop_duplicates()
        with self.output().open('w') as handle:
            cleaned.to_csv(handle)
class FilteredRS3Data(luigi.Task):
    """Filter RS3 training rows by assay membership and target-cut bounds."""
    __version__ = '0.3'
    requires = task.Requires()
    rs3_file = task.Requirement(RS3Train)
    assays = luigi.ListParameter()
    assays_end = luigi.ListParameter()
    assays_start = luigi.ListParameter()
    perc_pep_end = luigi.IntParameter()
    perc_pep_start = luigi.IntParameter()
    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        with self.rs3_file.output().open('r') as f:
            rs3_data = pd.read_csv(f)
        # Keep a row when ALL of the following hold:
        #   1. its Assay_ID is in `assays`;
        #   2. if its Assay_ID is in `assays_end`, Target_Cut < perc_pep_end
        #      (rows from other assays pass this test unconditionally);
        #   3. if its Assay_ID is in `assays_start`, Target_Cut > perc_pep_start
        #      (again, other assays pass unconditionally).
        # The parenthesization below is load-bearing -- & binds tighter
        # than | in pandas boolean indexing.
        filtered_rs3_data = rs3_data[(rs3_data.Assay_ID.isin(self.assays)) & (
            ((rs3_data.Target_Cut < self.perc_pep_end) &
             (rs3_data.Assay_ID.isin(self.assays_end)))
            | ~rs3_data.Assay_ID.isin(self.assays_end)) & (
                ((rs3_data.Target_Cut > self.perc_pep_start)
                 & rs3_data.Assay_ID.isin(self.assays_start))
                | ~rs3_data.Assay_ID.isin(self.assays_start))]
        with self.output().open('w') as f:
            filtered_rs3_data.to_csv(f)
class FilteredRS2Data(luigi.Task):
    """Join RS2 guide data with out-of-frame mutation rates on the 30mer."""
    __version__ = '0.1'
    requires = task.Requires()
    rs2_file = task.Requirement(RS2CombData)
    oof_fc_file = task.Requirement(OofFc)
    oof_res_file = task.Requirement(OofRes)

    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        with self.rs2_file.output().open('r') as handle:
            rs2 = pd.read_csv(handle)
        with self.oof_fc_file.output().open('r') as handle:
            oof_fc = pd.read_csv(handle)
        with self.oof_res_file.output().open('r') as handle:
            oof_res = pd.read_csv(handle)
        # Union of the two OOF sources, then restrict to guides present in
        # the RS2 data and drop any duplicated rows.
        oof_cols = ['30mer', 'OOF mutation rate']
        combined_oof = pd.merge(oof_res[oof_cols],
                                oof_fc[oof_cols],
                                how='outer')
        joined = combined_oof.merge(rs2, how='inner', on='30mer')
        rs2_oof = joined.drop_duplicates()

        with self.output().open('w') as handle:
            rs2_oof.to_csv(handle)
# Example 11
class Standardize(luigi.Task):
    """Fit a StandardScaler on the training feature matrix and pickle it.

    Downstream tasks (CrossValidate, PredictModel, ModelCoefficients) load
    this pickle and call scaler.transform on their feature matrices.
    """
    # Bumped: the output extension fix below changes the salted target path.
    __version__ = '0.2'
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()
    featurized = task.Requirement(FeaturizeTrain)

    # BUG FIX: this target stores a pickled scaler, not CSV text; the
    # extension was misleadingly '.csv'.  Nop format is kept for the
    # binary write, matching the other pickle outputs in this file.
    output = task.SaltedOutput(base_dir='data/featurized', ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        reqs = self.requires()
        with reqs['featurized'].output().open('r') as f:
            featurized_df = pd.read_csv(f)
        # Exclude the label and identifier columns from scaling.
        X = featurized_df[featurized_df.columns.difference(
            ['activity', 'kmer'])]
        scaler = preprocessing.StandardScaler().fit(X)
        with self.output().open('wb') as f:
            pickle.dump(scaler, f)