def classify(title, X, y, keys): print 'classify(title=%s, X=%s, y=%s, keys=%s)' % (title, X.shape, y.shape, keys) Xr, yr = select_features.resample_equal_y(X, y, 1.0) n_iter_val = 500 power_t_val = 0.9 alpha_val = 0.1 def get_sgd_hinge(): return SGDClassifier(loss="hinge", alpha=alpha_val, n_iter=n_iter_val, fit_intercept=True) def get_rbf_svc(): return svm.SVC(kernel='rbf', C=0.5, gamma=0.1) return classify_by_method(title + '_rbf', Xr, yr, keys, get_rbf_svc, True)
def compare_classifiers(title, X, y, keys): print 'compare_classifiers(title=%s, X=%s, y=%s, keys=%s)' % (title, X.shape, y.shape, keys) Xr, yr = select_features.resample_equal_y(X, y, 1.0) n_iter_val = 5000 power_t_val = 0.9 alpha_val = 0.1 CACHE_SIZE = 2000 def get_sgd_hinge(): return SGDClassifier(loss="hinge", alpha=alpha_val, n_iter=n_iter_val, fit_intercept=True) def get_svd_linear(): return svm.SVC(kernel='linear') def get_svd_poly(): return svm.SVC(kernel='poly') def get_nu_linear(): return svm.NuSVC(kernel='linear') def get_rbf_svc(): return svm.SVC(kernel='rbf', C=0.5, gamma=0.1) def get_linear_svc(): return svm.LinearSVC() def get_bayes_ridge(): return linear_model.BayesianRidge() def get_log_reg_l1(): return linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6) def get_log_reg_l2(): return linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6) def get_lars(): return linear_model.LassoLars(alpha = 0.1) def get_lasso(): return linear_model.Lasso(alpha = 0.1) def get_ridge(): return linear_model.Ridge (alpha = 0.5) # get_rbf_svc works best followed by get_log_reg_l* classifiers = { 'sgd_hinge': get_sgd_hinge, 'svd_linear': get_svd_linear, 'svd_poly': get_svd_poly, 'nu_linear': get_nu_linear, 'linear_svc': get_linear_svc, 'rbf_svc': get_rbf_svc, #'bayes_ridge': get_bayes_ridge, Not a classifier 'log_reg_l1': get_log_reg_l1, 'log_reg_l2': get_log_reg_l2, 'lars': get_lars, 'lasso': get_lasso, #'ridge': get_ridge, } slow = ['svd_poly'] classifier_order = sorted(classifiers.keys(), key = lambda k: (k in slow, k)) #print svm.SVC.__doc__ if False: for gamma in [0.0, 0.1, 0.2, 0.5]: for C in [0.1, 0.2, 0.5, 1.0]: def func(): return svm.SVC(kernel='rbf', C=C, gamma=gamma) #return svm.SVC(kernel='rbf', cache_size=CACHE_SIZE, C=C, gamma=gamma) name = '%s_gamma=%.2f_C=%.2f' % (title, gamma, C) classify_by_method(name, Xr, yr, keys, func, False) for name in classifier_order: func = classifiers[name] classify_by_method(title + '_' + name, Xr, yr, keys, 
func, False)
def classify_old(title, X, y, keys, get_classifier):
    """Cross-validated SGD (hinge-loss) classification with per-fold reports.

    Resamples (X, y) to equal class counts, runs stratified K-fold
    cross-validation with a fresh SGDClassifier per fold, accumulates the
    out-of-fold predictions, prints an overall classification report and
    confusion matrix, then plots predictions over the full resampled set.

    title          -- label used in printed output and the plot.
    X, y           -- numpy feature matrix and target vector.
    keys           -- feature names for the columns of X.
    get_classifier -- NOTE(review): not referenced anywhere in this body;
                      the classifier is hard-coded to SGDClassifier below.
    """
    print 'classify(title=%s, X=%s, y=%s, keys=%s)' % (title, X.shape, y.shape, keys)
    # Balance the class distribution before training.
    Xr, yr = select_features.resample_equal_y(X, y, 1.0)
    print 'classify: Xr=%s, yr=%s' % (Xr.shape, yr.shape)
    n_samples = Xr.shape[0]
    # Disabled alternative: a single 50/50 train/test split instead of K-fold.
    if False:
        X_train, y_train = Xr[:n_samples/2,:], yr[:n_samples/2]
        X_test, y_test = Xr[n_samples/2:,:], yr[n_samples/2:]
    NUM_FOLDS = 5
    # Old scikit-learn StratifiedKFold API: (labels, n_folds), iterable of
    # (train_indices, test_indices) pairs.
    skf = StratifiedKFold(yr, NUM_FOLDS)
    verbose = False
    # P() is a print function when verbose, a no-op otherwise, so the
    # per-fold reporting below can be toggled with one flag.
    if verbose:
        def P(s):
            print s
    else:
        def P(s):
            pass
    n_iter_val = 500
    # Hyper-parameter sweep scaffolding; currently a single (0.9, 0.1) point.
    for power_t_val in [0.9]:
        for alpha_val in [0.1]:
            # Accumulators for out-of-fold truth/predictions across folds.
            y_test_all = np.zeros(0)
            y_pred_all = np.zeros(0)
            for i,(train, test) in enumerate(skf):
                X_train, y_train = Xr[train,:], yr[train]
                X_test, y_test = Xr[test,:], yr[test]
                if verbose:
                    common.SUBHEADING()
                P('Fold %d of %d' % (i, NUM_FOLDS))
                P('classify: X_train=%s, y_train=%s' % (X_train.shape, y_train.shape))
                P('classify: X_test=%s, y_test=%s' % (X_test.shape, y_test.shape))
                # fit the model
                classifier = SGDClassifier(loss="hinge", alpha=alpha_val, n_iter=n_iter_val, fit_intercept=True)
                classifier.fit(X_train, y_train)
                y_pred = classifier.predict(X_test)
                P('Classification report for classifier %s:\n%s\n' % (classifier, metrics.classification_report(y_test, y_pred)))
                P('Confusion matrix:\n%s' % metrics.confusion_matrix(y_test, y_pred))
                # Append this fold's results to the running totals.
                y_test_all = np.r_[y_test_all, y_test]
                y_pred_all = np.r_[y_pred_all, y_pred]
            common.HEADING()
            # Overall report over all folds' out-of-fold predictions.
            # NOTE(review): `classifier` here is the one fitted on the LAST fold.
            print 'Classification report for all %s:\n%s\n' % (
                classifier, metrics.classification_report(y_test_all, y_pred_all))
            print 'Confusion matrix:\n%s' % metrics.confusion_matrix(y_test_all, y_pred_all)
    # plot the line, the points, and the nearest vectors to the plane
    # Disabled extra downsampling before plotting.
    if False:
        fac = 1.0
        print 'Downsampling by a further factor of %f' % fac
        X_r, y_r = sklearn.utils.resample(X, y, n_samples = int(X.shape[0] * fac))
    # Predict over the whole resampled set with the last-fold classifier and plot.
    y_pred = classifier.predict(Xr)
    plot_classification(Xr, yr, y_pred, keys, title, classifier)
top_features['f'][AGE_HIGH] = ['DrugCount_DSFS', 'proc_group=SDS', 'specialty=None', 'pcg=MISCL5', 'pcg=NEUMENT', 'pcg=ODaBNCA', 'pcg=SKNAUT', 'pcg=TRAUMA'] # Set random seed so that each run gives same results random.seed(333) np.random.seed(333) def P(s): """Print string s""" print s #logfile.write(s + '\n') features = 'all2' X,y,keys = getXy_by_features_(-1, features) Xr, yr = select_features.resample_equal_y(X, y, 1.0) Xr, yr = normalize(Xr, yr) sex_vals = np.unique(Xr[:,keys.index('Sex')]) age_vals = np.unique(Xr[:,keys.index('AgeAtFirstClaim')]) sex_boundary = sex_vals.mean() age_boundaries = [0.5*(age_vals[i]+age_vals[i+1]) for i in [0,age_vals.size-2]] print 'sex_vals = %s' % sex_vals print 'age_vals = %s' % age_vals print 'sex_boundary = %s' % sex_boundary print 'age_boundaries = %s' % age_boundaries print 'Xr=%s,yr=%s' % (Xr.shape, yr.shape) NUM_FOLDS = 2 skf = StratifiedKFold(yr, NUM_FOLDS)