Example #1
def SVC_fill(feature, df, metric='auc', test_size=.3):
    '''
    ### Support Vector Inference
    * Uses the [SVC](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)
      class from the [Scikit Learn Package](http://scikit-learn.org/stable/)
    * Features are binarized differential-expression vectors
      * Genes with a large change in expression from tumor to normal
      * Thresholded at 1 standard deviation above the mean to reduce overfitting
    * Parameters are fit by cross validation, optimizing for AUC score
      * Linear, RBF, and polynomial kernels are tried over a grid of parameters
      * The best model from cross validation is then fit on the entire dataset
    * Missing values are filled in from the model's predictions
    '''
    # Binarize expression per gene: flag values more than 1 SD above the mean
    gg = df.apply(lambda s: to_quants(s, std=1) > 0, axis=1)
    mat = gg

    # Restrict to patients present in both the expression matrix and the feature
    pats = gg.columns.intersection(feature.index)
    X = mat.ix[:, pats].T.as_matrix()
    Y = np.array(feature.ix[pats])
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=5796503)
    
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0, .1, .05, .01, 1e-3, 1e-4, 1e-5],
                         'C': [.1, 1, 10, 100, 1000], 'class_weight': ['auto']},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'class_weight': ['auto']},
                        {'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'class_weight': ['auto']}]
    
    if metric == 'auc':
        metric = auc_score
    elif metric == 'precision':
        metric = precision_score
    
    # Grid search over the kernels/parameters above (note: score_func,
    # class_weight='auto', and cv passed to fit follow the older scikit-learn API)
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=metric)
    clf.fit(X_train, y_train, cv=5)
    best = clf.best_estimator_
    auc = clf.score(X, Y)  # scored on the full labeled set, including the training split
    
    # Predict the feature for every patient, keeping the decision-function values
    mat_all = gg.ix[mat.index].T.as_matrix()
    inferred = best.predict(mat_all)
    inferred = pd.Series(inferred, index=gg.columns)
    fun = pd.Series(best.decision_function(mat_all)[:, 0], mat.columns)

    # Fill in only the entries that are missing in the original feature
    f = feature.copy()
    f = f.ix[inferred.index]
    f[f.isnull()] = inferred[f.isnull()]
    filled_feature = f.astype(float)
    return {'auc': auc, 'model': best, 'decision_function': fun,
            'inferred_values': inferred, 'filled_feature': filled_feature} 
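Note: the example above was written against an older scikit-learn/pandas API (score_func, class_weight='auto', cv passed to fit, .ix, .as_matrix()). Below is a minimal sketch of the same grid-search step using the current scikit-learn API; it is an illustrative reconstruction, not code from the Krysia/TCGA project, and it assumes X and y have already been prepared as in SVC_fill:

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split

def fit_svc_grid(X, y, test_size=.3, seed=5796503):
    # Hold out a test split, mirroring SVC_fill's train_test_split call
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)

    # gamma=0 from the original grid is no longer accepted and is dropped;
    # class_weight='auto' is now spelled 'balanced'
    param_grid = [
        {'kernel': ['rbf'], 'gamma': [.1, .05, .01, 1e-3, 1e-4, 1e-5],
         'C': [.1, 1, 10, 100, 1000], 'class_weight': ['balanced']},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'class_weight': ['balanced']},
        {'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'class_weight': ['balanced']},
    ]

    # scoring='roc_auc' replaces score_func=auc_score; cv is a constructor argument
    clf = GridSearchCV(SVC(), param_grid, scoring='roc_auc', cv=5)
    clf.fit(X_train, y_train)
    return clf.best_estimator_, clf.score(X_test, y_test)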
Example #2
File: R_Wrappers.py  Project: Krysia/TCGA
 def plot_me(sub_f, label):
     # Discretize real-valued features with many distinct values before fitting
     if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
         sub_f = to_quants(sub_f, q=q, std=std)

     # Cox proportional-hazards fit (via rpy2) and log-rank p-value
     m = get_cox_ph(surv, sub_f, formula=fmla)
     r_data = m.rx2('call')[2]
     p = log_rank(sub_f, surv)['p']
     ls = r.c(*colors)
     
     r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
            xlab='Years to Event', ylab='Survival')
     r.title(label, cex=3.)
     if ann == 'p':
         r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
     elif ann is not None:
         r.text(0, labels=ann, pos=4)
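plot_me above is a nested helper from R_Wrappers.py: surv, fmla, q, std, ann, and colors are free variables supplied by its enclosing function. The skeleton below only illustrates that closure structure; the wrapper's name, signature, and defaults are assumptions, not the project's actual code.

# Hypothetical wrapper showing where plot_me's free variables would come from;
# the real enclosing function lives in R_Wrappers.py
def draw_survival_curve(feature, surv, fmla, q=.25, std=None, ann='p',
                        colors=('blue', 'red')):

    def plot_me(sub_f, label):
        # ...body exactly as in Example #2, closing over
        # surv, fmla, q, std, ann, and colors...
        pass

    plot_me(feature, getattr(feature, 'name', ''))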
Example #3
    def plot_me(sub_f, label):
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)

        m = get_cox_ph(surv, sub_f, formula=fmla)
        r_data = m.rx2('call')[2]
        p = log_rank(sub_f, surv)['p']
        ls = r.c(*colors)

        r.plot(survival.survfit(fmla, r_data),
               lty=1,
               col=ls,
               lwd=4,
               cex=1.25,
               xlab='Years to Event',
               ylab='Survival')
        r.title(label, cex=3.)
        if ann == 'p':
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann is not None:
            r.text(0, labels=ann, pos=4)
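For a setup without R/rpy2, roughly the same output (Kaplan-Meier curves per group plus a log-rank p-value) can be produced with the lifelines package. This is an approximate stand-in for plot_me, not a drop-in replacement; the argument layout and styling are assumptions.

import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import multivariate_logrank_test

def km_plot(groups, durations, events, title=''):
    # groups, durations, events: pandas Series aligned on the same patient index
    ax = plt.gca()
    for name, idx in groups.groupby(groups).groups.items():
        kmf = KaplanMeierFitter()
        kmf.fit(durations.loc[idx], events.loc[idx], label=str(name))
        kmf.plot_survival_function(ax=ax)
    p = multivariate_logrank_test(durations, groups, events).p_value
    ax.set_xlabel('Years to Event')
    ax.set_ylabel('Survival')
    ax.set_title(title)
    ax.text(.2, 0, 'logrank p = {0:.1e}'.format(p))
    return ax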
Example #4
def SVC_fill_old(feature, df):
    # Binarize expression, then keep only features that associate with the
    # target in a chi-squared contingency test (p < .05)
    gg = df.apply(lambda s: to_quants(s, std=1) > 0)
    diff = screen_feature(feature, chi2_cont_test, gg)
    dd = diff[diff.p < .05]
    
    pats = gg.columns.intersection(feature.index)
    mat = gg.ix[dd.index]
    X = mat.ix[:, pats].T.as_matrix()
    Y = np.array(feature.ix[pats])
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.35, random_state=5796543)
    
    params = [{'kernel': ['rbf'], 'gamma': [0, .1, .05, .01, 1e-3, 1e-4, 1e-5],
               'C': [.1, 1, 10, 100, 1000], 'class_weight': ['auto']},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000],
               'class_weight': ['auto']},
              {'kernel': ['poly'], 'C': [1, 10, 100, 1000],
               'class_weight': ['auto']}]
    
    # Old scikit-learn API, as in Example #1 (score_func, cv passed to fit)
    clf = GridSearchCV(SVC(C=1), params, score_func=auc_score)
    clf.fit(X_train, y_train, cv=5)
    best = clf.best_estimator_
    auc = clf.score(X, Y)
    
    mat_all = gg.ix[mat.index].T.as_matrix()
    inferred = best.predict(mat_all)
    inferred = pd.Series(inferred, index=gg.columns)
    fun = pd.Series(best.decision_function(mat_all)[:, 0], mat.columns)
    f = feature.copy()
    f = f.ix[inferred.index]
    f[f.isnull()] = inferred[f.isnull()]
    filled_feature = f.astype(float)
    return {'auc': auc, 'model': best, 'decision_function': fun,
            'inferred_values': inferred, 'filled_feature': filled_feature} 
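SVC_fill_old differs from Example #1 mainly in its screening step: only features whose chi-squared contingency test against the target gives p < .05 are kept (screen_feature and chi2_cont_test are project helpers). A minimal stand-in for that step using scipy is sketched below, assuming the original layout of binary features in rows and patients in columns:

import pandas as pd
from scipy.stats import chi2_contingency

def screen_binary_features(target, binary_df, alpha=.05):
    # binary_df: boolean DataFrame (features in rows, patients in columns)
    # target:    binary Series indexed by patient
    pats = binary_df.columns.intersection(target.index)
    pvals = {}
    for name, row in binary_df[pats].iterrows():
        table = pd.crosstab(row, target[pats])
        pvals[name] = chi2_contingency(table)[1]  # p-value of the 2x2 test
    pvals = pd.Series(pvals, name='p')
    return pvals[pvals < alpha].index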
Example #5
def process_feature(feature, q, std):
    # Discretize real-valued features with many distinct values into labeled bins
    if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
        feature = to_quants(feature, q=q, std=std, labels=True)
    return feature
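process_feature delegates the actual binning to the project's to_quants helper. A rough pandas-only equivalent of that discretization (assuming, as the docstring in Example #1 suggests, quantile binning when q is given and a mean ± std threshold otherwise) might look like:

import pandas as pd

def discretize(s, q=None, std=None):
    # Quantile binning when q is given (e.g. q=.25 -> 4 bins)
    if q is not None:
        return pd.qcut(s, int(round(1. / q)), labels=False, duplicates='drop')
    # Otherwise a +/- std threshold around the mean: -1 below, 0 within, +1 above
    z = (s - s.mean()) / s.std()
    return (z > std).astype(int) - (z < -std).astype(int)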