Ejemplo n.º 1
0
class BestModel(luigi.Task):
    """Select the single best estimator among cross-validated candidates.

    Loads each requirement's pickled ``RandomizedSearchCV`` result, compares
    their ``best_score_`` values, and pickles the winning ``best_estimator_``.
    """
    __version__ = '0.3'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()

    requires = task.Requires()

    # Hard-coded switch between candidate model families: only the lasso
    # branch is ever active, but the GB grid is kept for easy toggling.
    example = True
    if example:
        cv_lasso = task.Requirement(
            CrossValidate,
            model_str='lasso',
            param_grid={'alpha': np.logspace(-3, 0, 100).tolist()})
    else:
        cv_gb = task.Requirement(CrossValidate,
                                 model_str='GB',
                                 param_grid={
                                     'max_depth':
                                     [int(x) for x in np.linspace(2, 40, 30)],
                                     'max_features':
                                     np.linspace(0.01, 0.3, 50).tolist(),
                                     'min_samples_split':
                                     np.linspace(0.01, 0.4, 50).tolist(),
                                     'subsample':
                                     np.linspace(0.6, 1, 50).tolist(),
                                     'alpha':
                                     np.linspace(0.5, 0.99, 50).tolist()
                                 })

    output = task.SaltedOutput(base_dir='data/models',
                               ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        """Pick the requirement with the highest CV score and pickle it."""
        reqs = self.requires()
        best_fit = None
        best_estimator = None
        for model, cv_x in reqs.items():
            with cv_x.output().open('rb') as f:
                cv_model = pickle.load(f)
            score = cv_model.best_score_
            # First result always wins; afterwards keep the higher score.
            if best_fit is None or score > best_fit:
                best_estimator = cv_model.best_estimator_
                best_fit = score

        if best_estimator is None:
            # Previously this fell through to a NameError at pickle.dump.
            raise RuntimeError('No cross-validation results to choose from')

        with self.output().open('wb') as f:
            pickle.dump(best_estimator, f)
Ejemplo n.º 2
0
class CrossValidate(luigi.Task):
    """Run a randomized hyperparameter search for one model family.

    Reads the featurized training matrix and fitted scaler, fits a
    ``RandomizedSearchCV`` over ``param_grid`` for the estimator selected by
    ``model_str``, and pickles the fitted search object.
    """
    __version__ = '0.5'
    model_str = luigi.Parameter()
    folds = luigi.IntParameter(default=10)
    param_grid = luigi.DictParameter()
    requires = task.Requires()
    scaler = task.Requirement(Standardize,
                              activity_column='percentile',
                              kmer_column='X30mer')

    featurized = task.Requirement(FeaturizeTrain)

    output = task.SaltedOutput(base_dir='data/cv',
                               ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        """Fit the randomized search and pickle the result."""
        reqs = self.requires()
        featurized = reqs['featurized']
        with featurized.output().open('r') as f:
            featurized_df = pd.read_csv(f)
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        y = featurized_df['activity']
        # Everything except the target and the raw kmer string is a feature.
        X = featurized_df[featurized_df.columns.difference(
            ['activity', 'kmer'])]
        X_train = scaler.transform(X)
        if self.model_str == 'GB':
            model = ensemble.GradientBoostingRegressor()
        elif self.model_str == 'RF':
            model = ensemble.RandomForestRegressor()
        elif self.model_str == 'lasso':
            model = linear_model.Lasso()
        elif self.model_str == 'EN':
            model = linear_model.ElasticNet()
        elif self.model_str == 'NN':
            model = neural_network.MLPRegressor()
        else:
            # Fail fast: previously an unknown model_str left `model`
            # unbound and raised a confusing NameError below.
            raise ValueError(
                'Unknown model_str: {0!r}'.format(self.model_str))
        grid_search = model_selection.RandomizedSearchCV(
            model,
            dict(self.param_grid),
            cv=self.folds,
            scoring='neg_mean_squared_error',
            n_iter=20,
            n_jobs=1)
        grid_search.fit(X_train, y)
        # Open in binary mode because the target format is luigi.format.Nop.
        with self.output().open('wb') as f:
            pickle.dump(grid_search, f)
Ejemplo n.º 3
0
class AnalyzePredictions(luigi.Task):
    """Compare prediction outputs from several PredictModel feature sets.

    NOTE(review): ``run()`` appears copy-pasted from ``PredictModel`` — it
    reads requirement keys ``'model'``, ``'test_mat'``, and ``'scaler'``,
    but this class only declares ``azimuth_predictions``,
    ``rs2_predictions``, and ``dimer_predictions``.  As written, ``run()``
    should fail on the missing keys; confirm the intended requirements and
    logic before relying on this task.
    """
    __version__ = '0.1'
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()
    # NOTE(review): Requirement() with no task class looks incomplete —
    # every other Requirement in this file names an upstream task. Verify.
    azimuth_predictions = task.Requirement()
    # RS2-style feature set: positional 1/2-mers plus GC content and Tm.
    rs2_predictions = task.Requirement(PredictModel,
                             features = {'Pos. Ind. 1mer': True,
                                          'Pos. Ind. 2mer': True,
                                          'Pos. Ind. 3mer': False,
                                          'Pos. Dep. 1mer': True,
                                          'Pos. Dep. 2mer': True,
                                          'Pos. Dep. 3mer': False,
                                          'GC content': True,
                                          'Tm': True})
    # Reduced feature set: position-dependent 2-mers and GC content only.
    dimer_predictions = task.Requirement(PredictModel,
                             features = {'Pos. Ind. 1mer': False,
                                          'Pos. Ind. 2mer': False,
                                          'Pos. Ind. 3mer': False,
                                          'Pos. Dep. 1mer': False,
                                          'Pos. Dep. 2mer': True,
                                          'Pos. Dep. 3mer': False,
                                          'GC content': True,
                                          'Tm': False})



    output = task.SaltedOutput(base_dir='data/predictions', ext='.csv')

    def run(self):
        reqs = self.requires()
        # NOTE(review): the keys below do not exist in this task's
        # requirements (see class docstring) — expected to raise KeyError.
        with reqs['model'].output().open('rb') as f:
            model = pickle.load(f)
        with reqs['test_mat'].output().open('r') as f:
            test_mat = pd.read_csv(f)
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        y = test_mat['activity']
        X = test_mat[test_mat.columns.difference(['activity', 'kmer'])]
        X_train = scaler.transform(X)
        predictions = model.predict(X_train)
        prediction_mat = pd.DataFrame({'kmer': test_mat['kmer'], 'true': y, 'predicted': predictions})
        with self.output().open('w') as f:
            prediction_mat.to_csv(f, index=False)
Ejemplo n.º 4
0
class PredictModel(luigi.Task):
    """Score the featurized Achilles test set with the best trained model.

    Writes a CSV with one row per kmer holding the observed activity and
    the model's prediction.
    """
    __version__ = '0.2'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()
    true_val = luigi.BoolParameter(default=True)

    requires = task.Requires()
    model = task.Requirement(BestModel,
                             activity_column='percentile',
                             kmer_column='X30mer')
    test_mat = task.Requirement(FeaturizeAchillesTest,
                                activity_column='sgRNA.measured.value',
                                kmer_column='X30mer')
    scaler = task.Requirement(Standardize,
                              activity_column='percentile',
                              kmer_column='X30mer')

    output = task.SaltedOutput(base_dir='data/predictions', ext='.csv')

    def run(self):
        """Load model, data, and scaler; predict; write the CSV."""
        requirements = self.requires()
        with requirements['model'].output().open('rb') as fh:
            estimator = pickle.load(fh)
        with requirements['test_mat'].output().open('r') as fh:
            test_df = pd.read_csv(fh)
        with requirements['scaler'].output().open('rb') as fh:
            fitted_scaler = pickle.load(fh)
        observed = test_df['activity']
        # Features are every column except the target and the raw kmer.
        feature_cols = test_df.columns.difference(['activity', 'kmer'])
        scaled = fitted_scaler.transform(test_df[feature_cols])
        predicted = estimator.predict(scaled)
        results = pd.DataFrame({
            'kmer': test_df['kmer'],
            'true': observed,
            'predicted': predicted
        })
        with self.output().open('w') as fh:
            results.to_csv(fh, index=False)
Ejemplo n.º 5
0
class ModelCoefficients(luigi.Task):
    """Export per-feature importances of the best trained model to CSV.

    Gradient-boosting models expose ``feature_importances_`` directly;
    lasso models go through ``score_coefs`` on the scaled training data.
    """
    __version__ = '0.3'
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()

    model = task.Requirement(BestModel,
                             activity_column='percentile',
                             kmer_column='X30mer')
    scaler = task.Requirement(Standardize,
                              activity_column='percentile',
                              kmer_column='X30mer')
    train_mat = task.Requirement(FeaturizeTrain,
                                 activity_column='percentile',
                                 kmer_column='X30mer')
    output = task.SaltedOutput(base_dir='data/models', ext='.csv')

    def run(self):
        """Compute importances for the model type and write them as CSV."""
        reqs = self.requires()
        with reqs['model'].output().open('rb') as f:
            model = pickle.load(f)
        with reqs['train_mat'].output().open('r') as f:
            train_mat = pd.read_csv(f)
        # Bug fix: the scaler is a pickle and must be opened in binary
        # mode ('rb'), as every sibling task does — 'r' breaks pickle.load.
        with reqs['scaler'].output().open('rb') as f:
            scaler = pickle.load(f)
        X = train_mat[train_mat.columns.difference(['activity', 'kmer'])]
        if isinstance(model, ensemble.GradientBoostingRegressor):
            importances = model.feature_importances_
        elif isinstance(model, linear_model.Lasso):
            importances = score_coefs(scaler.transform(X),
                                      train_mat['activity'], model.coef_,
                                      model.intercept_)
        else:
            # Previously an unsupported model left `importances` unbound.
            raise ValueError(
                'Unsupported model type: {0}'.format(type(model).__name__))
        feature_importances = pd.DataFrame({
            'feature': X.keys(),
            'importance': importances
        })
        with self.output().open('w') as f:
            feature_importances.to_csv(f, index=False)
Ejemplo n.º 6
0
class FilteredAchillesData(luigi.Task):
    """Inner-join Achilles test data with out-of-frame GV2 scores on X30mer."""
    __version__ = '0.1'
    requires = task.Requires()
    achilles_file = task.Requirement(AchillesTestData)

    oof_gv2_file = task.Requirement(OofGv2)

    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        """Merge the two inputs, drop stray index columns, write CSV."""
        with self.achilles_file.output().open('r') as fh:
            achilles = pd.read_csv(fh)
        with self.oof_gv2_file.output().open('r') as fh:
            oof_gv2 = pd.read_csv(fh)
        merged = pd.merge(achilles,
                          oof_gv2.drop_duplicates(),
                          how='inner',
                          on='X30mer')
        # 'X' and 'Unnamed: 0' are leftover index columns from the CSVs.
        cleaned = merged.drop(['X', 'Unnamed: 0'], axis=1).drop_duplicates()
        with self.output().open('w') as fh:
            cleaned.to_csv(fh)
Ejemplo n.º 7
0
class FilteredRS2Data(luigi.Task):
    """Join RS2 data with out-of-frame mutation rates from two sources."""
    __version__ = '0.1'
    requires = task.Requires()
    rs2_file = task.Requirement(RS2CombData)
    oof_fc_file = task.Requirement(OofFc)
    oof_res_file = task.Requirement(OofRes)

    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        """Outer-join the two OOF tables, inner-join onto RS2, write CSV."""
        with self.rs2_file.output().open('r') as fh:
            rs2 = pd.read_csv(fh)
        with self.oof_fc_file.output().open('r') as fh:
            oof_fc = pd.read_csv(fh)
        with self.oof_res_file.output().open('r') as fh:
            oof_res = pd.read_csv(fh)
        # Union the two OOF sources first, then keep only guides present
        # in the RS2 data.
        oof = pd.merge(oof_res[['30mer', 'OOF mutation rate']],
                       oof_fc[['30mer', 'OOF mutation rate']],
                       how='outer')
        combined = oof.merge(rs2, how='inner', on='30mer').drop_duplicates()

        with self.output().open('w') as fh:
            combined.to_csv(fh)
Ejemplo n.º 8
0
class Fasta(luigi.Task):
    """Write each sequence from the RS2 data as a FASTA record.

    Each sequence doubles as its own FASTA header line.
    """
    __version__ = '0.1'
    seq_col = luigi.Parameter()
    requires = task.Requires()
    seq_data = task.Requirement(RS2CombData)

    output = task.SaltedOutput(base_dir='./data/raw', ext='.FASTA')

    def run(self):
        """Read the sequence column and emit one FASTA record per row."""
        requirements = self.requires()
        with requirements['seq_data'].output().open('r') as fh:
            sequences = pd.read_csv(fh)[self.seq_col]
        with self.output().open('w') as fh:
            for sequence in sequences:
                fh.write('>{0}\n{1}\n'.format(sequence, sequence))
Ejemplo n.º 9
0
class FeaturizeTrain(BaseFeaturize):
    """Featurize the filtered RS3 training data for the listed assays."""

    # The full candidate assay pool (unused here, kept for reference):
    # ["Cd28", "Cd3e", "CD45", "Cd5", "Cd43", "H2-K", "Thy1",
    #  "CD13_TF-1", "CD13_NB4", "CD33_NB4", "CD33_MOLM-13",
    #  "CD15_MOLM-13", "CCDC101_AZD", "CUL3_PLX",
    #  "HPRT1_6TG", "MED12_AZD", "MED12_PLX", "NF1_PLX", "NF2_PLX",
    #  "TADA1_AZD", "TADA2B_AZD"]
    filtered = task.Requirement(
        FilteredRS3Data,
        assays=[
            "CD45", "Cd28", "Cd5", "Cd43", "H2-K", "Thy1",
            "CD13_TF-1", "CD33_MOLM-13",
            "CD15_MOLM-13", "CCDC101_AZD",
            "HPRT1_6TG", "MED12_AZD", "NF1_PLX", "NF2_PLX",
            "TADA1_AZD", "TADA2B_AZD"
        ],
        assays_end=[
            "CD45", "Cd5", "Cd43", "H2-K", "Thy1",
            "CD33_MOLM-13", "HPRT1_6TG", "MED12_AZD",
            "NF1_PLX", "NF2_PLX", "TADA2B_AZD"
        ],
        assays_start=["CD13_TF-1", "CD15_MOLM-13", "CD45"],
        perc_pep_end=80,
        perc_pep_start=20)
Ejemplo n.º 10
0
class FilteredRS3Data(luigi.Task):
    """Filter RS3 training rows by assay membership and cut-site position."""
    __version__ = '0.3'
    requires = task.Requires()
    rs3_file = task.Requirement(RS3Train)
    assays = luigi.ListParameter()
    assays_end = luigi.ListParameter()
    assays_start = luigi.ListParameter()
    perc_pep_end = luigi.IntParameter()
    perc_pep_start = luigi.IntParameter()
    output = task.SaltedOutput(base_dir='./data/filtered', ext='.csv')

    def run(self):
        """Apply the assay/cut-site filter and write the surviving rows."""
        with self.rs3_file.output().open('r') as fh:
            rs3 = pd.read_csv(fh)
        in_assays = rs3.Assay_ID.isin(self.assays)
        in_end = rs3.Assay_ID.isin(self.assays_end)
        in_start = rs3.Assay_ID.isin(self.assays_start)
        # End-restricted assays must cut before perc_pep_end; everything
        # else passes this clause unconditionally.
        end_ok = (in_end & (rs3.Target_Cut < self.perc_pep_end)) | ~in_end
        # Start-restricted assays must cut after perc_pep_start; likewise.
        start_ok = ((in_start & (rs3.Target_Cut > self.perc_pep_start))
                    | ~in_start)
        filtered = rs3[in_assays & end_ok & start_ok]
        with self.output().open('w') as fh:
            filtered.to_csv(fh)
Ejemplo n.º 11
0
class Standardize(luigi.Task):
    """Fit a StandardScaler on the featurized training matrix and pickle it."""
    __version__ = '0.1'
    activity_column = luigi.Parameter()
    kmer_column = luigi.Parameter()
    features = luigi.DictParameter()
    guide_start = luigi.IntParameter()
    guide_length = luigi.IntParameter()
    pam_start = luigi.IntParameter()
    pam_length = luigi.IntParameter()

    requires = task.Requires()
    featurized = task.Requirement(FeaturizeTrain)

    # The output is a pickled scaler, not a CSV — use '.pickle' like the
    # other pickle-producing tasks (BestModel, CrossValidate), with Nop
    # format so the target opens in binary mode.
    output = task.SaltedOutput(base_dir='data/featurized', ext='.pickle',
                               format=luigi.format.Nop)

    def run(self):
        """Fit the scaler on the feature columns and pickle it."""
        reqs = self.requires()
        with reqs['featurized'].output().open('r') as f:
            train_mat = pd.read_csv(f)
        # Features are every column except the target and the raw kmer.
        X = train_mat[train_mat.columns.difference(['activity', 'kmer'])]
        scaler = preprocessing.StandardScaler().fit(X)
        with self.output().open('wb') as f:
            pickle.dump(scaler, f)
Ejemplo n.º 12
0
class FeaturizeAchillesTest(BaseFeaturize):
    """Featurize the Achilles (Gv2) held-out test data.

    Only the upstream data source is overridden here; the featurization
    itself presumably lives in BaseFeaturize (defined elsewhere).
    """

    filtered = task.Requirement(Gv2Test)
Ejemplo n.º 13
0
class FeaturizeDoenchTest(BaseFeaturize):
    """Featurize the Doench held-out test data.

    Only the upstream data source is overridden here; the featurization
    itself presumably lives in BaseFeaturize (defined elsewhere).
    """

    filtered = task.Requirement(DoenchTestData)