def test_get_k_fold_partition(self):
    """k-fold partitioning covers every sample exactly once as test data."""
    num_sample = 100
    x, y = [], []
    for _ in range(num_sample):
        # three random feature values per sample, one random class label
        x.append([
            np.random.randint(11, 50),
            np.random.randint(11, 50),
            np.random.randint(11, 50)
        ])
        y.append(np.random.randint(0, 10))
    assert len(x) == num_sample
    assert len(y) == num_sample

    k = 10
    perm_indices = np.random.permutation(num_sample)

    seen_test_samples = []
    for k_idx in range(k):
        x_train, y_train, x_val, y_val, x_test, y_test = \
            emr.get_k_fold_partition(x, y, k_idx, k, perm_indices)

        # partition sizes must add up to the full dataset
        assert num_sample == len(x_train) + len(x_val) + len(x_test)
        assert num_sample == len(y_train) + len(y_val) + len(y_test)
        # and the partitions must hold exactly the original values
        assert sorted(x) == sorted(x_train + x_val + x_test)
        assert sorted(y) == sorted(y_train + y_val + y_test)

        seen_test_samples.extend(x_test)

    # check full test coverage: every sample lands in exactly one test fold
    assert len(seen_test_samples) == num_sample
    assert sorted(seen_test_samples) == sorted(x)
def run(data_fn, method='lrfc', which_half='both', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20):
    """Run k-fold evaluation of scikit-learn baseline classifiers.

    Loads previously tuned parameters from CACHE_DIR, trains/tests each
    classifier on every requested k-fold partition, saves per-partition test
    results, and prints aggregate metrics.

    Arguments:
        data_fn: string
            data file filename
        method: string
            family of classifiers to run; values = {'lrfc', 'svm'}
        which_half: string
            which half of the k partitions to run;
            values = {'first', 'last', 'both'}
        prop_missing: float
            proportion of feature observations which should be randomly masked
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            whether to skip the polynomial and RBF SVMs
        nb_searches: int
            number of parameter-search iterations (kept for interface
            compatibility; not used directly here)
    """
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    def get_results_dir(method, k_idx):
        # Build (and create if needed) the per-method, per-partition output
        # directory. os.makedirs creates all intermediate directories, so a
        # single call covers 'out', 'out/more' and the base folder.
        # (Previous version checked 'out/more' but created 'out/models'.)
        base_folder = 'out/more/{}_{}_{}'.format(method, data_fn,
                                                 prop_missing)
        folder = '{}/{}_idx_partition'.format(base_folder, k_idx)
        if not os.path.exists(folder):
            os.makedirs(folder)
        return folder

    try:  # load saved parameters
        get_param_fn = lambda x: '{}/{}_{}_{}_param.pkl'.format(
            CACHE_DIR, x, data_fn, prop_missing)
        # pickle files must be opened in binary mode ('rb'), not text mode
        if method == 'lrfc':
            with open(get_param_fn('logit'), 'rb') as f:
                logit_params = pickle.load(f)
            with open(get_param_fn('rfc'), 'rb') as f:
                rfc_params = pickle.load(f)
        elif method == 'svm':
            with open(get_param_fn('linear-svm'), 'rb') as f:
                linear_svm_params = pickle.load(f)
            if not skip_nonlinear_svm:
                with open(get_param_fn('poly-svm'), 'rb') as f:
                    poly_svm_params = pickle.load(f)
                with open(get_param_fn('rbf-svm'), 'rb') as f:
                    rbf_svm_params = pickle.load(f)
        else:
            raise ValueError('unknown method: {}'.format(method))
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # propagate untouched; still re-raised after the hint is printed
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    model_names = ['logit', 'rfc', 'linear-svm', 'poly-svm', 'rbf-svm']
    losses = {name: [] for name in model_names}
    accs = {name: [] for name in model_names}
    runtimes = {name: [] for name in model_names}

    # `k // 2` (integer division): `k / 2` is a float on Python 3 and
    # range() would raise a TypeError
    if which_half == 'first':
        loop_seq = range(0, k // 2)
    elif which_half == 'last':
        loop_seq = range(k // 2, k)
    elif which_half == 'both':
        loop_seq = range(0, k)
    else:
        raise ValueError(
            '`which_half` must be \'first\', \'last\' or \'both\'')

    for k_idx in loop_seq:
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']
        X_test = data_partition_dict['X_test']
        y_test = data_partition_dict['y_test']

        # feature selection is fit on train + validation, never on test
        selected_feat_indices = select_feats(X_train + X_val, y_train + y_val,
                                             nb_features=nb_features)
        X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features)
        X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features)

        old_nb_features = len(X_train[0])
        X_train = X_train[:, selected_feat_indices]
        X_test = X_test[:, selected_feat_indices]
        nb_features = len(X_train[0])  # extraneous but for future utility
        print('Reduced features from {} to {}'.format(old_nb_features,
                                                      nb_features))

        if method == 'lrfc':
            from sklearn.linear_model import LogisticRegression
            from sklearn.ensemble import RandomForestClassifier

            # logistic regression
            start = time.time()
            logit = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs', **logit_params[k_idx])
            logit.fit(X_train, y_train)
            logit_acc = accuracy_score(y_test, logit.predict(X_test))
            logit_y_test_proba = logit.predict_proba(X_test)
            logit_loss = log_loss(y_test, logit_y_test_proba)
            logit_time = time.time() - start
            print(
                'Logistic regression / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                .format(logit_loss, logit_acc, logit_time))

            # random forest classifier
            start = time.time()
            rfc = RandomForestClassifier(**rfc_params[k_idx])
            rfc.fit(X_train, y_train)
            rfc_acc = accuracy_score(y_test, rfc.predict(X_test))
            rfc_y_test_proba = rfc.predict_proba(X_test)
            rfc_loss = log_loss(y_test, rfc_y_test_proba)
            rfc_time = time.time() - start
            print(
                'Random forest / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                .format(rfc_loss, rfc_acc, rfc_time))

            save_test_results(
                logit_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('logit', k_idx)))
            save_test_results(
                rfc_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('rfc', k_idx)))

            losses['logit'].append(logit_loss)
            accs['logit'].append(logit_acc)
            runtimes['logit'].append(logit_time)
            losses['rfc'].append(rfc_loss)
            accs['rfc'].append(rfc_acc)
            runtimes['rfc'].append(rfc_time)

        elif method == 'svm':
            from sklearn.svm import SVC

            # linear SVM
            start = time.time()
            linear_svm = SVC(kernel='linear', probability=True,
                             **linear_svm_params[k_idx])
            linear_svm.fit(X_train, y_train)
            linear_svm_acc = accuracy_score(y_test,
                                            linear_svm.predict(X_test))
            linear_svm_y_test_proba = linear_svm.predict_proba(X_test)
            linear_svm_loss = log_loss(y_test, linear_svm_y_test_proba)
            linear_svm_time = time.time() - start
            print(
                'Linear SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                .format(linear_svm_acc, linear_svm_loss, linear_svm_time))

            save_test_results(
                linear_svm_y_test_proba, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('linear-svm', k_idx)))

            losses['linear-svm'].append(linear_svm_loss)
            accs['linear-svm'].append(linear_svm_acc)
            runtimes['linear-svm'].append(linear_svm_time)

            if skip_nonlinear_svm:
                continue  # skip

            # polynomial SVM
            start = time.time()
            poly_svm = SVC(kernel='poly', probability=True,
                           **poly_svm_params[k_idx])
            poly_svm.fit(X_train, y_train)
            poly_svm_acc = accuracy_score(y_test, poly_svm.predict(X_test))
            poly_svm_y_test_proba = poly_svm.predict_proba(X_test)
            poly_svm_loss = log_loss(y_test, poly_svm_y_test_proba)
            poly_svm_time = time.time() - start
            print(
                'Polynomial SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                .format(poly_svm_acc, poly_svm_loss, poly_svm_time))

            # RBF SVM
            start = time.time()
            rbf_svm = SVC(kernel='rbf', probability=True,
                          **rbf_svm_params[k_idx])
            rbf_svm.fit(X_train, y_train)
            rbf_svm_acc = accuracy_score(y_test, rbf_svm.predict(X_test))
            rbf_svm_y_test_proba = rbf_svm.predict_proba(X_test)
            rbf_svm_loss = log_loss(y_test, rbf_svm_y_test_proba)
            rbf_svm_time = time.time() - start
            print('RBF SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'.
                  format(rbf_svm_acc, rbf_svm_loss, rbf_svm_time))

            save_test_results(
                poly_svm_y_test_proba, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('poly-svm', k_idx)))
            save_test_results(
                rbf_svm_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('rbf-svm',
                                                             k_idx)))

            losses['poly-svm'].append(poly_svm_loss)
            accs['poly-svm'].append(poly_svm_acc)
            runtimes['poly-svm'].append(poly_svm_time)
            losses['rbf-svm'].append(rbf_svm_loss)
            accs['rbf-svm'].append(rbf_svm_acc)
            runtimes['rbf-svm'].append(rbf_svm_time)
        else:
            raise ValueError('unknown method: {}'.format(method))

        print()

    print('#' * 72)
    if method == 'lrfc':
        print_metrics(losses['logit'], accs['logit'], runtimes['logit'],
                      'Logistic regression')
        print_metrics(losses['rfc'], accs['rfc'], runtimes['rfc'],
                      'Random forest')
    elif method == 'svm':
        print_metrics(losses['linear-svm'], accs['linear-svm'],
                      runtimes['linear-svm'], 'Linear SVM')
        if not skip_nonlinear_svm:
            print_metrics(losses['poly-svm'], accs['poly-svm'],
                          runtimes['poly-svm'], 'Polynomial SVM')
            print_metrics(losses['rbf-svm'], accs['rbf-svm'],
                          runtimes['rbf-svm'], 'RBF SVM')
    else:
        raise ValueError('unknown method: {}'.format(method))
    print('#' * 72)
def run(data_fn, prop_missing=0., max_num_feature=-1,
        feature_selection='random', k=10, data_dir='_data', out_dir='_out'):
    """Run RIDDLE classification interpretation pipeline.

    For each of the k partitions, loads the previously trained MLP from
    `<out>/k_idx=<k_idx>/model.h5`, computes DeepLIFT-style difference sums
    via `feature_importance.get_diff_sums`, pickles the per-partition sums,
    then aggregates across partitions and writes an interpretation summary.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly
            masked; values in [0, 1)
        max_num_feature: int
            maximum number of features to use; <= 0 disables selection
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        data_dir: string
            directory where data files are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    from keras.models import load_model
    from riddle import emr, feature_importance
    from riddle.models import MLP

    start = time.time()

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    # per-partition DeepLIFT sums, accumulated across the k folds
    list_sums_D, list_sums_D2, list_sums_contribs = [], [], []

    for k_idx in range(k):
        full_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        print('\nPartition k = {}'.format(k_idx))
        x_train_unvec, y_train, _, _, x_test_unvec, y_test = emr.get_k_fold_partition(
            x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

        if max_num_feature > 0:  # select features and re-encode
            # NOTE(review): idx_feat_dict and num_feature are overwritten
            # here on the first iteration and then reused by select_features
            # on later iterations — confirm this compounding is intended.
            # Only x_test_unvec is re-encoded (x_train_unvec is not used
            # after this point in the loop).
            feat_encoding_dict, idx_feat_dict = select_features(
                x_train_unvec, y_train, idx_feat_dict,
                method=feature_selection, num_feature=num_feature,
                max_num_feature=max_num_feature)
            x_test_unvec = subset_reencode_features(x_test_unvec,
                                                    feat_encoding_dict)
            num_feature = max_num_feature

        # interpret
        # NOTE(review): this shadows the outer `start`, so the final timing
        # print reports time since the last partition began, not the total.
        start = time.time()

        # temp model only supplies process_x for vectorizing the test data;
        # actual trained weights are loaded from the saved HDF5 file
        temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
        hdf5_path = full_out_dir + '/model.h5'
        sums_D, sums_D2, sums_contribs, pairs = \
            feature_importance.get_diff_sums(
                hdf5_path,
                x_test_unvec,
                process_x_func=temp_mlp.process_x,
                num_feature=num_feature,
                num_class=num_class)

        # persist per-partition sums for offline analysis
        with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
            pickle.dump(sums_D, f)
        with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
            pickle.dump(sums_D2, f)
        with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
            pickle.dump(sums_contribs, f)

        list_sums_D.append(sums_D)
        list_sums_D2.append(sums_D2)
        list_sums_contribs.append(sums_contribs)

    def compute_total_sums(list_sums):
        # Element-wise sum across partitions. Mutates (and returns) the
        # first partition's sums in place as the accumulator.
        total_sums = list_sums[0]
        for i in range(1, len(list_sums)):
            for j in range(len(total_sums)):
                total_sums[j] = np.add(total_sums[j], list_sums[i][j])
        return total_sums

    total_sums_D = compute_total_sums(list_sums_D)
    total_sums_D2 = compute_total_sums(list_sums_D2)
    total_sums_contribs = compute_total_sums(list_sums_contribs)

    num_sample = len(x_unvec)

    # NOTE(review): `pairs` carries the value from the *last* loop iteration;
    # presumably the class pairs are identical across partitions — verify.
    run_interpretation_summary(
        x_unvec, y, total_sums_D, total_sums_D2, total_sums_contribs,
        idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict,
        icd9_descript_dict=icd9_descript_dict, pairs=pairs,
        num_sample=num_sample, full_out_dir=base_out_dir)

    print('Computed DeepLIFT scores and analysis in {:.4f} seconds'.format(
        time.time() - start))
    print('-' * 72)
    print()
def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, max_num_sample, feature_selection, k_idx, k, num_search,
        perm_indices):
    """Run a parameter search for a single k-fold partition.

    Arguments:
        method: string
            name of classification method; values = {'logit',
            'random_forest', 'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt',
            'riddle'}
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes present
        max_num_sample: int
            maximum number of validation samples to search over; None = no cap
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        num_search: int
            number of searches (parameter configurations) to try
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )

    Returns:
        best_param: {string: ?}
            dictionary mapping parameter names to the best values found
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))

    x_train_unvec, y_train, x_val_unvec, y_val, _, _ = (
        emr.get_k_fold_partition(x_unvec, y, k_idx=k_idx, k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:  # select features and re-encode
        # selection is fit on the training split; only the validation split
        # (used by the search below) needs re-encoding
        feat_encoding_dict, _ = select_features(
            x_train_unvec, y_train, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_val_unvec = subset_reencode_features(x_val_unvec,
                                               feat_encoding_dict)
        num_feature = max_num_feature

    # cap number of validation samples to keep the search tractable
    # (`is not None` instead of `!= None`: identity check for None)
    if max_num_sample is not None and len(x_val_unvec) > max_num_sample:
        x_val_unvec = x_val_unvec[0:max_num_sample]
        y_val = y_val[0:max_num_sample]

    start = time.time()
    if method == 'riddle':
        model_class = MLP
        init_args = {'num_feature': num_feature, 'num_class': num_class}
        param_dist = {
            'num_hidden_layer': 2,  # [1, 2]
            'num_hidden_node': 512,  # [128, 256, 512]
            'activation': ['prelu', 'relu'],
            'dropout': tuning.Uniform(lo=0.2, hi=0.8),
            'learning_rate': tuning.UniformLogSpace(10, lo=-6, hi=-1),
        }
        best_param = tuning.random_search(
            model_class, init_args, param_dist, x_val_unvec, y_val,
            num_class=num_class, k=TUNING_K, num_search=num_search)
    else:  # scikit-learn methods
        x_val = vectorize_features(x_val_unvec, num_feature)

        if method == 'logit':  # logistic regression
            from sklearn.linear_model import LogisticRegression
            estimator = LogisticRegression(multi_class='multinomial',
                                           solver='lbfgs')
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-3, hi=3)}
        elif method == 'random_forest':
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier()
            param_dist = {
                'max_features': ['sqrt', 'log2', None],
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0,
                                                           hi=7),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4,
                                                              hi=8)
            }
        elif method == 'linear_svm':
            from sklearn.svm import SVC
            # remark: due to a bug in scikit-learn / libsvm, the sparse
            # 'linear' kernel is much slower than the sparse 'poly' kernel,
            # so we use the 'poly' kernel with degree=1 over the 'linear'
            # kernel
            estimator = SVC(kernel='poly', degree=1, coef0=0., gamma=1.,
                            probability=True, cache_size=1000)
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1)}
        elif method == 'poly_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='poly', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'degree': [2, 3, 4],
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'rbf_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='rbf', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'gbdt':
            from xgboost import XGBClassifier
            estimator = XGBClassifier(objective='multi:softprob')
            param_dist = {
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0,
                                                           hi=5),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4,
                                                              hi=8),
                'learning_rate': tuning.UniformLogSpace(base=10, lo=-3, hi=0)
            }
        else:
            raise ValueError('unknown method: {}'.format(method))

        param_search = RandomizedSearchCV(
            estimator, param_dist, refit=False, n_iter=num_search,
            scoring=loss_scorer)
        param_search.fit(x_val, y_val)

        best_param = param_search.best_params_

    print('Best parameters for {} for k_idx={}: {} found in {:.3f} s'.format(
        method, k_idx, best_param, time.time() - start))

    return best_param
def run(data_fn, method='lrfc', which_half='both', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20):
    """Run k-fold evaluation of scikit-learn baseline classifiers.

    Loads previously tuned parameters from CACHE_DIR, trains/tests each
    classifier on every requested k-fold partition, saves per-partition test
    results, and prints aggregate metrics.

    Arguments:
        data_fn: string
            data file filename
        method: string
            family of classifiers to run; values = {'lrfc', 'svm'}
        which_half: string
            which half of the k partitions to run;
            values = {'first', 'last', 'both'}
        prop_missing: float
            proportion of feature observations which should be randomly masked
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            whether to skip the polynomial and RBF SVMs
        nb_searches: int
            number of parameter-search iterations (kept for interface
            compatibility; not used directly here)
    """
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    def get_results_dir(method, k_idx):
        # Build (and create if needed) the per-method, per-partition output
        # directory. os.makedirs creates all intermediate directories, so a
        # single call covers 'out', 'out/more' and the base folder.
        # (Previous version checked 'out/more' but created 'out/models'.)
        base_folder = 'out/more/{}_{}_{}'.format(method, data_fn,
                                                 prop_missing)
        folder = '{}/{}_idx_partition'.format(base_folder, k_idx)
        if not os.path.exists(folder):
            os.makedirs(folder)
        return folder

    try:  # load saved parameters
        get_param_fn = lambda x: '{}/{}_{}_{}_param.pkl'.format(
            CACHE_DIR, x, data_fn, prop_missing)
        # pickle files must be opened in binary mode ('rb'), not text mode
        if method == 'lrfc':
            with open(get_param_fn('logit'), 'rb') as f:
                logit_params = pickle.load(f)
            with open(get_param_fn('rfc'), 'rb') as f:
                rfc_params = pickle.load(f)
        elif method == 'svm':
            with open(get_param_fn('linear-svm'), 'rb') as f:
                linear_svm_params = pickle.load(f)
            if not skip_nonlinear_svm:
                with open(get_param_fn('poly-svm'), 'rb') as f:
                    poly_svm_params = pickle.load(f)
                with open(get_param_fn('rbf-svm'), 'rb') as f:
                    rbf_svm_params = pickle.load(f)
        else:
            raise ValueError('unknown method: {}'.format(method))
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # propagate untouched; still re-raised after the hint is printed
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    model_names = ['logit', 'rfc', 'linear-svm', 'poly-svm', 'rbf-svm']
    losses = {name: [] for name in model_names}
    accs = {name: [] for name in model_names}
    runtimes = {name: [] for name in model_names}

    # `k // 2` (integer division): `k / 2` is a float on Python 3 and
    # range() would raise a TypeError
    if which_half == 'first':
        loop_seq = range(0, k // 2)
    elif which_half == 'last':
        loop_seq = range(k // 2, k)
    elif which_half == 'both':
        loop_seq = range(0, k)
    else:
        raise ValueError(
            '`which_half` must be \'first\', \'last\' or \'both\'')

    for k_idx in loop_seq:
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']
        X_test = data_partition_dict['X_test']
        y_test = data_partition_dict['y_test']

        # feature selection is fit on train + validation, never on test
        selected_feat_indices = select_feats(X_train + X_val, y_train + y_val,
                                             nb_features=nb_features)
        X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features)
        X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features)

        old_nb_features = len(X_train[0])
        X_train = X_train[:, selected_feat_indices]
        X_test = X_test[:, selected_feat_indices]
        nb_features = len(X_train[0])  # extraneous but for future utility
        print('Reduced features from {} to {}'.format(old_nb_features,
                                                      nb_features))

        if method == 'lrfc':
            from sklearn.linear_model import LogisticRegression
            from sklearn.ensemble import RandomForestClassifier

            # logistic regression
            start = time.time()
            logit = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs', **logit_params[k_idx])
            logit.fit(X_train, y_train)
            logit_acc = accuracy_score(y_test, logit.predict(X_test))
            logit_y_test_probas = logit.predict_proba(X_test)
            logit_loss = log_loss(y_test, logit_y_test_probas)
            logit_time = time.time() - start
            print('Logistic regression / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                  .format(logit_loss, logit_acc, logit_time))

            # random forest classifier
            start = time.time()
            rfc = RandomForestClassifier(**rfc_params[k_idx])
            rfc.fit(X_train, y_train)
            rfc_acc = accuracy_score(y_test, rfc.predict(X_test))
            rfc_y_test_probas = rfc.predict_proba(X_test)
            rfc_loss = log_loss(y_test, rfc_y_test_probas)
            rfc_time = time.time() - start
            print('Random forest / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                  .format(rfc_loss, rfc_acc, rfc_time))

            save_test_results(
                logit_y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir('logit', k_idx)))
            save_test_results(
                rfc_y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir('rfc', k_idx)))

            losses['logit'].append(logit_loss)
            accs['logit'].append(logit_acc)
            runtimes['logit'].append(logit_time)
            losses['rfc'].append(rfc_loss)
            accs['rfc'].append(rfc_acc)
            runtimes['rfc'].append(rfc_time)

        elif method == 'svm':
            from sklearn.svm import SVC

            # linear SVM
            start = time.time()
            linear_svm = SVC(kernel='linear', probability=True,
                             **linear_svm_params[k_idx])
            linear_svm.fit(X_train, y_train)
            linear_svm_acc = accuracy_score(y_test,
                                            linear_svm.predict(X_test))
            linear_svm_y_test_probas = linear_svm.predict_proba(X_test)
            linear_svm_loss = log_loss(y_test, linear_svm_y_test_probas)
            linear_svm_time = time.time() - start
            print('Linear SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                  .format(linear_svm_acc, linear_svm_loss, linear_svm_time))

            save_test_results(
                linear_svm_y_test_probas, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('linear-svm', k_idx)))

            losses['linear-svm'].append(linear_svm_loss)
            accs['linear-svm'].append(linear_svm_acc)
            runtimes['linear-svm'].append(linear_svm_time)

            if skip_nonlinear_svm:
                continue  # skip

            # polynomial SVM
            start = time.time()
            poly_svm = SVC(kernel='poly', probability=True,
                           **poly_svm_params[k_idx])
            poly_svm.fit(X_train, y_train)
            poly_svm_acc = accuracy_score(y_test, poly_svm.predict(X_test))
            poly_svm_y_test_probas = poly_svm.predict_proba(X_test)
            poly_svm_loss = log_loss(y_test, poly_svm_y_test_probas)
            poly_svm_time = time.time() - start
            print('Polynomial SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                  .format(poly_svm_acc, poly_svm_loss, poly_svm_time))

            # RBF SVM
            start = time.time()
            rbf_svm = SVC(kernel='rbf', probability=True,
                          **rbf_svm_params[k_idx])
            rbf_svm.fit(X_train, y_train)
            rbf_svm_acc = accuracy_score(y_test, rbf_svm.predict(X_test))
            rbf_svm_y_test_probas = rbf_svm.predict_proba(X_test)
            rbf_svm_loss = log_loss(y_test, rbf_svm_y_test_probas)
            rbf_svm_time = time.time() - start
            print('RBF SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                  .format(rbf_svm_acc, rbf_svm_loss, rbf_svm_time))

            save_test_results(
                poly_svm_y_test_probas, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('poly-svm', k_idx)))
            save_test_results(
                rbf_svm_y_test_probas, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('rbf-svm', k_idx)))

            losses['poly-svm'].append(poly_svm_loss)
            accs['poly-svm'].append(poly_svm_acc)
            runtimes['poly-svm'].append(poly_svm_time)
            losses['rbf-svm'].append(rbf_svm_loss)
            accs['rbf-svm'].append(rbf_svm_acc)
            runtimes['rbf-svm'].append(rbf_svm_time)
        else:
            raise ValueError('unknown method: {}'.format(method))

        print()

    print('#' * 72)
    if method == 'lrfc':
        print_metrics(losses['logit'], accs['logit'], runtimes['logit'],
                      'Logistic regression')
        print_metrics(losses['rfc'], accs['rfc'], runtimes['rfc'],
                      'Random forest')
    elif method == 'svm':
        print_metrics(losses['linear-svm'], accs['linear-svm'],
                      runtimes['linear-svm'], 'Linear SVM')
        if not skip_nonlinear_svm:
            print_metrics(losses['poly-svm'], accs['poly-svm'],
                          runtimes['poly-svm'], 'Polynomial SVM')
            print_metrics(losses['rbf-svm'], accs['rbf-svm'],
                          runtimes['rbf-svm'], 'RBF SVM')
    else:
        raise ValueError('unknown method: {}'.format(method))
    print('#' * 72)
def run(data_fn, method='lrfc', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20, max_nb_samples=10000):
    """Run randomized parameter search for each k-fold partition.

    Searches the validation split of every partition and pickles the best
    parameters per method to CACHE_DIR.

    Arguments:
        data_fn: string
            data file filename
        method: string
            family of models to search; values = {'riddle', 'lrfc', 'svm'}
        prop_missing: float
            proportion of feature observations which should be randomly masked
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            whether to skip the polynomial and RBF SVMs
        nb_searches: int
            number of parameter configurations to try
        max_nb_samples: int
            maximum number of validation samples to search over; None = no cap
    """
    if 'dummy' in data_fn or 'debug' in data_fn:
        nb_searches = 3  # tiny datasets: keep the search cheap

    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    if not FORCE_RUN:  # check if already did param search, if so, skip
        def did(method_names):
            # helper: True if cached parameters already exist for all names
            return already_done(method_names, data_fn, prop_missing)

        if method == 'riddle' and did(['riddle']):
            eprint('Already did parameter search for riddle')
            return
        elif method == 'lrfc' and did(['logit', 'rfc']):
            eprint('Already did parameter search for lrfc')
            return
        elif method == 'svm' and did(['linear-svm', 'poly-svm', 'rbf-svm']):
            eprint('Already did parameter search for svm')
            return

    params = {
        'riddle': {},
        'logit': {},
        'rfc': {},
        'linear-svm': {},
        'poly-svm': {},
        'rbf-svm': {}
    }

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    for k_idx in range(0, k):
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']

        # cap number of validation samples to keep the search tractable
        # (`is not None` instead of `!= None`: identity check for None)
        if max_nb_samples is not None and len(X_val) > max_nb_samples:
            X_val = X_val[0:max_nb_samples]
            y_val = y_val[0:max_nb_samples]

        if method != 'riddle':
            # sklearn methods need vectorized, feature-selected input
            selected_feat_indices = select_feats(X_train + X_val,
                                                 y_train + y_val,
                                                 nb_features=nb_features)
            X_val, y_val = preproc_for_sklearn(X_val, y_val,
                                               nb_features=nb_features)
            X_val = X_val[:, selected_feat_indices]

        if method == 'riddle':
            start = time.time()
            model_module = models.deep_mlp
            riddle_param_dist = {
                'learning_rate': UniformLogSpace(10, lo=-6, hi=-1)
            }
            params['riddle'][k_idx] = parameter_tuning.random_search(
                model_module, riddle_param_dist, X_val, y_val,
                nb_features=nb_features, nb_classes=nb_classes, k=3,
                process_X_data_func_args={'nb_features': nb_features},
                process_y_data_func_args={'nb_classes': nb_classes},
                nb_searches=nb_searches)
            print('Best parameters for RIDDLE: {} found in {:.3f} s'.format(
                params['riddle'][k_idx], time.time() - start))

        elif method == 'lrfc':
            # logistic regression
            start = time.time()
            logit_param_dist = {'C': UniformLogSpace()}
            logit_estimator = LogisticRegression(multi_class='multinomial',
                                                 solver='lbfgs')
            params['logit'][k_idx] = parameter_search(
                X_val, y_val, estimator=logit_estimator,
                search=RandomizedSearchCV, dist_or_grid=logit_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print(
                'Best parameters for logistic regression: {} found in {:.3f} s'
                .format(params['logit'][k_idx], time.time() - start))

            # random forest classifier
            start = time.time()
            rfc_param_dist = {'max_features': ['sqrt', 'log2'],
                              'max_depth': UniformLogSpace(base=2, lo=2,
                                                           hi=9)}
            rfc_estimator = RandomForestClassifier()
            params['rfc'][k_idx] = parameter_search(
                X_val, y_val, estimator=rfc_estimator,
                search=RandomizedSearchCV, dist_or_grid=rfc_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for random forest: {} found in {:.3f} s'.
                  format(params['rfc'][k_idx], time.time() - start))

        elif method == 'svm':
            # linear SVM
            start = time.time()
            linear_svm_param_dist = {'C': UniformLogSpace()}
            linear_svm_estimator = SVC(kernel='linear', probability=True)
            params['linear-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=linear_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=linear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print(
                'Best parameters for linear SVM: {} found in {:.3f} s'.format(
                    params['linear-svm'][k_idx], time.time() - start))

            if skip_nonlinear_svm:
                continue  # skip

            # shared by the polynomial and RBF kernels
            nonlinear_svm_param_dist = {
                'C': UniformLogSpace(),
                'gamma': UniformLogSpace(base=10, lo=-5, hi=1)
            }

            # polynomial SVM
            start = time.time()
            poly_svm_estimator = SVC(kernel='poly', probability=True)
            params['poly-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=poly_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for polynomial SVM: {} found in {:.3f} s'.
                  format(params['poly-svm'][k_idx], time.time() - start))

            # RBF SVM
            start = time.time()
            rbf_svm_estimator = SVC(kernel='rbf', probability=True)
            params['rbf-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=rbf_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for RBF SVM: {} found in {:.3f} s'.format(
                params['rbf-svm'][k_idx], time.time() - start))
        else:
            raise ValueError('unknown method: {}'.format(method))

    # save: only methods that were actually searched have non-empty dicts
    for method_name, sub_param_dict in params.items():
        if len(sub_param_dict) > 0:
            pickle_object(
                sub_param_dict,
                '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, method_name,
                                               data_fn, prop_missing))

    print('Finished parameter search for method: {}'.format(method))
def run(ModelClass, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, feature_selection, k_idx, k, params, perm_indices,
        init_args, full_out_dir):
    """Run a classification pipeline for a single k-fold partition.

    Partitions the data, optionally selects/re-encodes features, vectorizes,
    trains a `ModelClass` instance with that partition's tuned parameters,
    and evaluates it on the test split.

    Arguments:
        ModelClass: Python class
            classification model
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes present
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            list of dictionary mapping parameter names to values for each
            k-fold partition
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )
        init_args: {string: ?}
            dictionary mapping initialization argument names to values
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))
    print(params[k_idx])

    partition = emr.get_k_fold_partition(
        x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
    x_train_unvec, y_train, _, _, x_test_unvec, y_test = partition

    if max_num_feature > 0:
        # fit the feature selection on the training split only, then
        # re-encode both splits with the resulting feature mapping
        feat_encoding_dict, _ = select_features(
            x_train_unvec, y_train, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_train_unvec = subset_reencode_features(x_train_unvec,
                                                 feat_encoding_dict)
        x_test_unvec = subset_reencode_features(x_test_unvec,
                                                feat_encoding_dict)
        num_feature = max_num_feature

    x_train = vectorize_features(x_train_unvec, num_feature)
    x_test = vectorize_features(x_test_unvec, num_feature)

    # shared init arguments merged with this partition's tuned parameters
    model_kwargs = dict(init_args)
    model_kwargs.update(params[k_idx])

    fit_start = time.time()
    clf = ModelClass(**model_kwargs)
    clf.fit(x_train, y_train)
    y_test_probas = clf.predict_proba(x_test)
    elapsed = time.time() - fit_start

    evaluate(y_test, y_test_probas, elapsed, num_class=num_class,
             out_dir=full_out_dir)
def run(x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict,
        num_feature, max_num_feature, num_class, feature_selection, k_idx, k,
        params, perm_indices, full_out_dir):
    """Run a RIDDLE classification pipeline for a single k-fold partition.

    Trains a RIDDLE MLP on the partition's training/validation splits,
    scores it on the test split, saves evaluation results and the trained
    model, then clears the Keras session.

    Arguments:
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        idx_class_dict: {int: string}
            dictionary mapping class indices to classes
        icd9_descript_dict: {string: string}
            dictionary mapping ICD9 codes to description text
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            list of dictionary mapping parameter names to values for each
            k-fold partition
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    # NOTE(review): imports are local to the function, presumably to defer
    # Keras/TensorFlow initialization (e.g., for multiprocessing) — confirm
    from keras import backend as K
    from riddle import emr, feature_importance
    from riddle.models import MLP

    print('Partition k = {}'.format(k_idx))
    print()

    x_train_unvec, y_train, x_val_unvec, y_val, x_test_unvec, y_test = (
        emr.get_k_fold_partition(x_unvec, y, k_idx=k_idx, k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:  # select features and re-encode
        # selection is fit on the training split only, then applied to all
        # three splits; idx_feat_dict is rebound to the re-encoded mapping
        feat_encoding_dict, idx_feat_dict = select_features(
            x_train_unvec, y_train, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_train_unvec = subset_reencode_features(x_train_unvec,
                                                 feat_encoding_dict)
        x_val_unvec = subset_reencode_features(x_val_unvec,
                                               feat_encoding_dict)
        x_test_unvec = subset_reencode_features(x_test_unvec,
                                                feat_encoding_dict)
        num_feature = max_num_feature

    # set up; -1 appears to mean "no epoch cap", and debug runs are cut to
    # 3 epochs — behavior depends on MLP's handling of max_num_epoch
    max_num_epoch = -1
    if 'debug' in full_out_dir:
        max_num_epoch = 3

    model = MLP(num_feature=num_feature, num_class=num_class,
                max_num_epoch=max_num_epoch, **params[k_idx])

    # train and test
    start = time.time()
    model.train(x_train_unvec, y_train, x_val_unvec, y_val)
    y_test_probas = model.predict_proba(x_test_unvec)
    runtime = time.time() - start
    print('Completed training and testing in {:.4f} seconds'.format(runtime))
    print('-' * 72)
    print()

    # evaluate model performance
    evaluate(y_test, y_test_probas, runtime, num_class=num_class,
             out_dir=full_out_dir)

    model.save_model(path=full_out_dir + '/model.h5')
    # release the TF graph/memory before the next partition runs
    K.clear_session()

    print('Finished with partition k = {}'.format(k_idx))
    print('=' * 72)
    print()
def run(data_fn, method='lrfc', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20, max_nb_samples=10000):
    """Run a randomized hyperparameter search for each k-fold partition.

    For every partition, searches the method-specific parameter space on the
    (capped) validation split and pickles the best parameters per partition.

    Arguments:
        data_fn: string
            data filename, joined with DATA_DIR to form the data path
        method: string
            family of models to search; values = {'riddle', 'lrfc', 'svm'}
        prop_missing: float
            proportion of data to randomly simulate as missing
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            if True, skip the polynomial and RBF SVM searches
        nb_searches: int
            number of random search iterations (forced to 3 for
            dummy/debug datasets)
        max_nb_samples: int or None
            cap on the number of validation samples used for the search;
            None disables the cap

    Raises:
        ValueError: if `method` is not one of the supported values
    """
    if 'dummy' in data_fn or 'debug' in data_fn:
        nb_searches = 3  # keep debug runs fast

    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    if not FORCE_RUN:  # check if already did param search, if so, skip
        def did(x):  # helper (was a lambda; PEP 8 E731 prefers def)
            return already_done(x, data_fn, prop_missing)

        if method == 'riddle' and did(['riddle']):
            eprint('Already did parameter search for riddle')
            return
        elif method == 'lrfc' and did(['logit', 'rfc']):
            eprint('Already did parameter search for lrfc')
            return
        elif method == 'svm' and did(['linear-svm', 'poly-svm', 'rbf-svm']):
            eprint('Already did parameter search for svm')
            return

    # best parameters per model, keyed by partition index
    params = {'riddle': {}, 'logit': {}, 'rfc': {}, 'linear-svm': {},
              'poly-svm': {}, 'rbf-svm': {}}

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    for k_idx in range(k):
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']

        # cap number of validation samples
        # (was `max_nb_samples != None`; `is not None` per PEP 8)
        if max_nb_samples is not None and len(X_val) > max_nb_samples:
            X_val = X_val[0:max_nb_samples]
            y_val = y_val[0:max_nb_samples]

        if method != 'riddle':
            # sklearn models need vectorized input plus feature selection
            selected_feat_indices = select_feats(
                X_train + X_val, y_train + y_val, nb_features=nb_features)
            X_val, y_val = preproc_for_sklearn(
                X_val, y_val, nb_features=nb_features)
            X_val = X_val[:, selected_feat_indices]

        if method == 'riddle':
            start = time.time()
            model_module = models.deep_mlp
            riddle_param_dist = {
                'learning_rate': UniformLogSpace(10, lo=-6, hi=-1)}
            params['riddle'][k_idx] = parameter_tuning.random_search(
                model_module, riddle_param_dist, X_val, y_val,
                nb_features=nb_features, nb_classes=nb_classes, k=3,
                process_X_data_func_args={'nb_features': nb_features},
                process_y_data_func_args={'nb_classes': nb_classes},
                nb_searches=nb_searches)
            print('Best parameters for RIDDLE: {} found in {:.3f} s'
                  .format(params['riddle'][k_idx], time.time() - start))

        elif method == 'lrfc':
            # logistic regression
            start = time.time()
            logit_param_dist = {'C': UniformLogSpace()}
            logit_estimator = LogisticRegression(multi_class='multinomial',
                                                 solver='lbfgs')
            params['logit'][k_idx] = parameter_search(
                X_val, y_val, estimator=logit_estimator,
                search=RandomizedSearchCV, dist_or_grid=logit_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for logistic regression: {} found in '
                  '{:.3f} s'
                  .format(params['logit'][k_idx], time.time() - start))

            # random forest classifier
            start = time.time()
            rfc_param_dist = {'max_features': ['sqrt', 'log2'],
                              'max_depth': UniformLogSpace(base=2, lo=2, hi=9)}
            rfc_estimator = RandomForestClassifier()
            params['rfc'][k_idx] = parameter_search(
                X_val, y_val, estimator=rfc_estimator,
                search=RandomizedSearchCV, dist_or_grid=rfc_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for random forest: {} found in {:.3f} s'
                  .format(params['rfc'][k_idx], time.time() - start))

        elif method == 'svm':
            # linear SVM
            start = time.time()
            linear_svm_param_dist = {'C': UniformLogSpace()}
            linear_svm_estimator = SVC(kernel='linear', probability=True)
            params['linear-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=linear_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=linear_svm_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for linear SVM: {} found in {:.3f} s'
                  .format(params['linear-svm'][k_idx], time.time() - start))

            if skip_nonlinear_svm:
                continue  # skip

            # poly and RBF kernels share the same search space
            nonlinear_svm_param_dist = {
                'C': UniformLogSpace(),
                'gamma': UniformLogSpace(base=10, lo=-5, hi=1)}

            # polynomial SVM
            start = time.time()
            poly_svm_estimator = SVC(kernel='poly', probability=True)
            params['poly-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=poly_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for polynomial SVM: {} found in {:.3f} s'
                  .format(params['poly-svm'][k_idx], time.time() - start))

            # RBF SVM
            start = time.time()
            rbf_svm_estimator = SVC(kernel='rbf', probability=True)
            params['rbf-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=rbf_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for RBF SVM: {} found in {:.3f} s'
                  .format(params['rbf-svm'][k_idx], time.time() - start))

        else:
            raise ValueError('unknown method: {}'.format(method))

    # save best parameters for every model that was actually searched
    for method_name, sub_param_dict in params.items():
        if len(sub_param_dict) > 0:
            pickle_object(sub_param_dict, '{}/{}_{}_{}_param.pkl'.format(
                CACHE_DIR, method_name, data_fn, prop_missing))

    print('Finished parameter search for method: {}'.format(method))