def rec_feature_elim(data, num_features=17700):
    """Select *num_features* genes by recursive feature elimination.

    Fits a linear SVC and uses RFE to prune one feature per step, then
    reports the surviving genes via print_genes_nonzero_coeff.

    Args:
        data: expression dataset exposing get_gene_exp_matrix() and
            get_labels().
        num_features: number of features RFE keeps (default 17700).
    """
    X = data.get_gene_exp_matrix()
    y = data.get_labels()
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=num_features, step=1)
    selector = rfe.fit(X, y)
    # BUG FIX: selector.support_ holds numpy bools (np.True_/np.False_);
    # the old `x is True` identity test is always False for np.True_,
    # which silently produced an all-zero mask. Test truthiness instead.
    mask = [1 if x else 0 for x in selector.support_]
    print_genes_nonzero_coeff(data, mask)
def kFoldGetSparsity(data,logregAlgo,k=4): print "--------------------------------------------" X = data.get_gene_exp_matrix() Y = data.get_labels() kf = cross_validation.KFold(len(X), k=k,shuffle=True) sparsity = [] for train_index, test_index in kf: X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index] y_train, y_test = [Y[i] for i in train_index], [Y[i] for i in test_index] logregAlgo.fit(X_train,y_train) coeffs = logregAlgo.coef_.ravel() print print_genes_nonzero_coeff(data,coeffs) sparsity.append(np.mean(coeffs==0)*100) return(sparsity)
def rec_feature_elim_with_KFold(data): """Recursive feature elimination FIXME: How to pick a kernel? WARNING: ridiculously slow? """ X = data.get_gene_exp_matrix() y = data.get_labels() # Create the RFE object and compute a cross-validated score. svc = SVC(kernel="linear") rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2),loss_func=zero_one) selector = rfecv.fit(X, y) mask = map(lambda x: 1 if x is True else 0,selector.support_) print_genes_nonzero_coeff(data,mask) print "Optimal number of features : %d" % rfecv.n_features_