def classify_digits():
    r"""Fit a hierarchical classifier on a subset of the handwritten digits and print its scores.

    The digits 1, 3, 7, 8 and 9 are arranged in the following class hierarchy:

            <ROOT>
           /      \
          A        B
         / \       |  \
        1   7      C   9
                 /   \
                3     8

    The function prints the raw data, a flat classification report for the
    held-out split, and the value returned by ``h_fbeta_score``.
    """
    digits_hierarchy = {
        ROOT: ["A", "B"],
        "A": ["1", "7"],
        "B": ["C", "9"],
        "C": ["3", "8"],
    }

    # Dimensionality reduction followed by a probabilistic RBF SVM is used as
    # the base estimator handed to the hierarchical classifier.
    reducer = TruncatedSVD(n_components=24)
    svc = svm.SVC(gamma=0.001, kernel="rbf", probability=True)
    clf = HierarchicalClassifier(
        base_estimator=make_pipeline(reducer, svc),
        class_hierarchy=digits_hierarchy,
    )

    features, labels = make_digits_dataset(targets=[1, 7, 3, 8, 9], as_str=False)
    print(type(features), features.shape, features)

    # The hierarchy above uses string labels, so the targets are cast to match.
    labels = labels.astype(str)
    print(type(labels[0]), labels.shape, labels)

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=RANDOM_STATE,
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Hierarchical metrics are computed on the representation produced by
    # the multi_labeled wrapper.
    with multi_labeled(y_test, y_pred, clf.graph_) as (true_ml, pred_ml, graph_ml):
        h_fbeta = h_fbeta_score(true_ml, pred_ml, graph_ml)
        print("h_fbeta_score: ", h_fbeta)
def custom_h_fbeta(y_true, y_pred, graph=None):
    """Return only the F-beta component of ``h_fbeta_score``.

    ``y_true``, ``y_pred`` and ``graph`` are passed through ``multi_labeled``;
    the values it yields are handed to ``h_fbeta_score``, whose result is
    unpacked as (precision, recall, fbeta). Only the last element is returned,
    which makes this function usable as a single-valued scoring callable.
    """
    with multi_labeled(y_true, y_pred, graph) as (true_ml, pred_ml, graph_ml, _classes):
        _precision, _recall, fbeta = h_fbeta_score(true_ml, pred_ml, graph_ml)
    return fbeta
def test_h_scores(graph, y_true, y_pred, expected_hr_score, expected_hp_score, expected_hf1_score):
    """Check hierarchical recall, precision and F-beta against expected values.

    Each metric is evaluated on the output of ``multi_labeled`` and must be
    within 0.0001 of its expected score.
    """
    # Checked in the order recall, precision, F-beta.
    metric_expectations = (
        (h_recall_score, expected_hr_score),
        (h_precision_score, expected_hp_score),
        (h_fbeta_score, expected_hf1_score),
    )
    with multi_labeled(y_true, y_pred, graph) as (true_ml, pred_ml, graph_ml):
        for metric, expected in metric_expectations:
            assert_that(
                metric(y_true=true_ml, y_pred=pred_ml, class_hierarchy=graph_ml),
                is_(close_to(expected, delta=0.0001)),
            )
def main(argv):
    """Run nested cross-validation of a hierarchical sleep-stage classifier.

    Args:
        argv: Sequence of command-line arguments. ``argv[0]`` is the path of
            the input feature CSV and ``argv[1]`` is the output directory,
            which is created if it does not exist.

    Side effects:
        Prints class counts and per-fold scores, calls
        ``get_classification_report`` with the accumulated per-node results
        and passes the per-fold predictions to ``save_user_report`` with the
        target path ``<outdir>/hierarchical_results.csv``.
    """
    infile = argv[0]
    outdir = argv[1]

    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(outdir, exist_ok=True)

    # Read data file and retain data only corresponding to 5 sleep states
    # recorded in the 'UPenn' dataset.
    df = pd.read_csv(infile, dtype={
        'label': object,
        'user': object,
        'position': object,
        'dataset': object,
    })
    sleep_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    keep = df['label'].isin(sleep_states) & (df['dataset'] == 'UPenn')
    # drop=True keeps the original columns only and yields a clean 0..n-1 index.
    df = df[keep].reset_index(drop=True)

    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for cls in ctr:
        print('%s: %d (%0.2f%%)' % (cls, ctr[cls], ctr[cls] * 100.0 / len(df)))

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_min', 'ENMO_max', 'ENMO_mad',
        'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prevdiff', 'ENMO_nextdiff',
        'angz_mean', 'angz_std', 'angz_min', 'angz_max', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prevdiff', 'angz_nextdiff',
        'LIDS_mean', 'LIDS_std', 'LIDS_min', 'LIDS_max', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prevdiff', 'LIDS_nextdiff',
    ]

    X = df[feat_cols].values
    y = df['label']
    groups = df['user']

    # Class hierarchy for sleep stages
    class_hierarchy = {
        ROOT: {"Wake", "Sleep"},
        "Sleep": {"NREM", "REM"},
        "NREM": {"Light", "NREM 3"},
        "Light": {"NREM 1", "NREM 2"},
    }
    graph = DiGraph(class_hierarchy)

    outer_cv_splits = 5
    inner_cv_splits = 3

    # Nodes for which per-node metrics are collected, in reporting order.
    node_names = ['Wake', 'Sleep', 'REM', 'NREM', 'NREM 3', 'Light', 'NREM 1', 'NREM 2']
    results = {
        name: {'precision': [], 'recall': [], 'fbeta': []}
        for name in node_names + ['Overall']
    }

    # Outer CV: folds are grouped by user so no user is in both train and test.
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    hierarchical_pred = []
    for out_fold, (train_indices, test_indices) in enumerate(
            group_kfold.split(X, y, groups), start=1):
        print('Processing fold ' + str(out_fold))
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        # The split yields positions, so index the Series positionally.
        out_fold_y_train = y.iloc[train_indices]
        out_fold_y_test = y.iloc[test_indices]
        out_fold_users_test = groups.iloc[test_indices]

        # Create a pipeline with scaler and hierarchical classifier
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', HierarchicalClassifier(
                base_estimator=RandomForestClassifier(
                    random_state=0, n_estimators=100, n_jobs=-1),
                class_hierarchy=class_hierarchy,
                prediction_depth='mlnp',
                progress_wrapper=tqdm,
            )),
        ])

        # Inner CV: stratified splits of the outer training portion,
        # materialised so they can be passed as explicit index pairs.
        strat_kfold = StratifiedKFold(
            n_splits=inner_cv_splits, random_state=0, shuffle=True)
        custom_cv_indices = list(
            strat_kfold.split(out_fold_X_train, out_fold_y_train))

        print('Training')
        search_params = {
            'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500],
            'clf__base_estimator__max_depth': [5, 10, None],
        }
        cv_clf = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=search_params,
            cv=custom_cv_indices,
            scoring=make_scorer(custom_h_fbeta, graph=graph),
            n_iter=5,
            n_jobs=-1,
            verbose=1,
        )
        cv_clf.fit(out_fold_X_train, out_fold_y_train)

        print('Predicting')
        out_fold_y_pred = cv_clf.predict(out_fold_X_test)

        best_clf = cv_clf.best_estimator_

        # Demonstrate using our hierarchical metrics module with MLB wrapper
        with multi_labeled(out_fold_y_test, out_fold_y_pred,
                           best_clf.named_steps['clf'].graph_) \
                as (y_test_, y_pred_, graph_, classes_):
            fold_h_prec, fold_h_rec, fold_h_fbeta = h_fbeta_score(
                y_test_, y_pred_, graph_)
            overall = results['Overall']
            overall['precision'].append(fold_h_prec)
            overall['recall'].append(fold_h_rec)
            overall['fbeta'].append(fold_h_fbeta)
            print("Fold %d: precision: %0.4f, recall: %0.4f, fbeta: %0.4f" %
                  (out_fold, fold_h_prec, fold_h_rec, fold_h_fbeta))

            # Per-node metrics are computed on the ancestor-filled labels.
            y_test_ = fill_ancestors(y_test_, graph=graph_)
            y_pred_ = fill_ancestors(y_pred_, graph=graph_)

            hierarchical_pred.append(
                (out_fold_users_test, y_test_, y_pred_, classes_))

            for node in node_names:
                node_prec, node_rec, node_fbeta, _ = get_node_metrics(
                    y_test_, y_pred_, classes_, node)
                node_results = results[node]
                node_results['precision'].append(node_prec)
                node_results['recall'].append(node_rec)
                node_results['fbeta'].append(node_fbeta)

    get_classification_report(results)

    save_user_report(hierarchical_pred,
                     os.path.join(outdir, 'hierarchical_results.csv'))
def main(argv):
    """Evaluate previously saved models on a test feature file and save the predictions.

    Args:
        argv: Sequence ``[infile, modeldir, mode, ensemble, outdir]``.
            ``infile`` is the test feature CSV, ``modeldir`` the directory
            holding saved models (files whose name contains ``mode`` are
            used), ``mode`` is one of ``'binary'``, ``'nonwear'``,
            ``'multiclass'`` or ``'hierarchical'``, ``ensemble`` is ``0`` to
            use a single model or ``1`` to average the predictions of all
            matching models, and ``outdir`` is where the result CSV path
            ``<mode>_test_classification.csv`` is placed.

    Raises:
        ValueError: If ``mode`` is not recognised, or if ``mode`` is
            ``'hierarchical'`` with ``ensemble == 0`` (that combination has
            no prediction path in this script).
        FileNotFoundError: If ``modeldir`` contains no file whose name
            contains ``mode``.
    """
    infile = argv[0]
    modeldir = argv[1]
    mode = argv[2]
    ensemble = int(argv[3])  # 0 - use best model, 1 - use ensemble
    outdir = argv[4]

    if mode not in ('binary', 'nonwear', 'multiclass', 'hierarchical'):
        raise ValueError('Unknown mode: %r' % (mode,))
    if mode == 'hierarchical' and not ensemble:
        # Previously this fell through and crashed on an unbound y_pred.
        raise ValueError(
            "mode 'hierarchical' is only supported with ensemble=1")

    df = pd.read_csv(infile)

    method = 'feat_eng'
    if mode == 'binary':
        states = ['Wake', 'Sleep']
        collate_states = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']
        df.loc[df['label'].isin(collate_states), 'label'] = 'Sleep'
    elif mode == 'nonwear':
        states = ['Wear', 'Nonwear']
        collate_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
        df.loc[df['label'].isin(collate_states), 'label'] = 'Wear'
    elif mode == 'multiclass':
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    else:  # hierarchical
        method = 'hierarchical'
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']

    df = df[df['label'].isin(states)].reset_index(drop=True)

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_range', 'ENMO_mad', 'ENMO_entropy1',
        'ENMO_entropy2', 'ENMO_prev30diff', 'ENMO_next30diff',
        'ENMO_prev60diff', 'ENMO_next60diff', 'ENMO_prev120diff',
        'ENMO_next120diff', 'angz_mean', 'angz_std', 'angz_range', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prev30diff', 'angz_next30diff',
        'angz_prev60diff', 'angz_next60diff', 'angz_prev120diff',
        'angz_next120diff', 'LIDS_mean', 'LIDS_std', 'LIDS_range', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prev30diff', 'LIDS_next30diff',
        'LIDS_prev60diff', 'LIDS_next60diff', 'LIDS_prev120diff',
        'LIDS_next120diff',
    ]

    ts_test = df['timestamp']
    x_test = df[feat_cols].values
    y_test = df['label']
    if mode != 'hierarchical':
        # Non-hierarchical models use integer class ids in `states` order.
        y_test = np.array([states.index(i) for i in y_test])
    users_test = df['user']
    fnames_test = df['filename']

    N = x_test.shape[0]

    # Sorted so the selection does not depend on directory listing order.
    model_fnames = sorted(
        fname for fname in os.listdir(modeldir) if mode in fname)
    if not model_fnames:
        raise FileNotFoundError(
            'No model file containing %r found in %s' % (mode, modeldir))

    # SECURITY NOTE: joblib/pickle deserialisation can execute arbitrary code;
    # only point `modeldir` at trusted model files.
    if ensemble:
        nfolds = len(model_fnames)
        for fold, fname in enumerate(model_fnames):
            print('Processing fold ' + str(fold + 1))
            model_path = os.path.join(modeldir, fname)
            if mode != 'hierarchical':
                # Loading by path lets joblib open and close the file itself.
                scaler, cv_clf = joblib.load(model_path)
                x_test_sc = scaler.transform(x_test)
                fold_y_pred = cv_clf.predict_proba(x_test_sc)
                if mode == 'multiclass':
                    # predict_proba is indexed per class here; keep column 1
                    # of each entry as that class's score.
                    fold_y_pred_collated = np.zeros(
                        (fold_y_pred[0].shape[0], len(fold_y_pred)))
                    for cls, cls_prob in enumerate(fold_y_pred):
                        fold_y_pred_collated[:, cls] = cls_prob[:, 1]
                    fold_y_pred = fold_y_pred_collated
            else:
                # joblib.load also reads plain pickle files, and correctly
                # reads files written with joblib.dump.
                cv_clf = joblib.load(model_path).best_estimator_
                fold_y_pred = cv_clf.predict(x_test)
                fold_y_pred_prob = cv_clf.predict_proba(x_test)
                fitted_graph = cv_clf.named_steps['clf'].graph_
                # Column order is taken from the loaded classifier's own graph.
                # A graph rebuilt locally from a dict of sets can order
                # sibling nodes differently in another process.
                # NOTE(review): assumes predict_proba columns follow
                # graph_.nodes order with ROOT excluded - confirm.
                proba_classes = [
                    node for node in fitted_graph.nodes if node != ROOT]
                with multi_labeled(y_test, fold_y_pred, fitted_graph) \
                        as (y_test_, y_pred_, graph_, classes_):
                    states = classes_
                    y_test_ = fill_ancestors(y_test_, graph=graph_)
                    # Reorder probability columns into classes_ order.
                    fold_y_pred_ = np.zeros(fold_y_pred_prob.shape)
                    for new_idx, label in enumerate(classes_):
                        old_idx = proba_classes.index(label)
                        fold_y_pred_[:, new_idx] = fold_y_pred_prob[:, old_idx]
                    fold_y_pred = fold_y_pred_

            # Accumulate prediction probabilities
            if fold == 0:
                y_pred = np.zeros((N, len(states)))
            y_pred += fold_y_pred

        # Get average predictions
        y_pred = y_pred / float(nfolds)
        if mode == 'hierarchical':
            y_test = y_test_
    else:
        model_fname = model_fnames[0]
        scaler, clf = joblib.load(os.path.join(modeldir, model_fname))
        x_test_sc = scaler.transform(x_test)
        # NOTE(review): unlike the ensemble path, 'multiclass' output is not
        # collated here - confirm the single model returns a 2-D array.
        y_pred = clf.predict_proba(x_test_sc)

    # Save test results
    y_pred = [(users_test, ts_test, fnames_test, y_test, y_pred)]
    cv_save_classification_result(
        y_pred, states,
        os.path.join(outdir, mode + '_test_classification.csv'),
        method=method)
def main(argv):
    """Train hierarchical sleep/non-wear classifiers with nested CV and save models and predictions.

    Args:
        argv: Sequence of command-line arguments. ``argv[0]`` is the input
            feature CSV and ``argv[2]`` is the output directory. ``argv[1]``
            (a dataset name) is accepted positionally but not used here.

    Side effects:
        Creates ``<outdir>/models`` if needed and dumps the fitted search
        object of every outer fold there as
        ``fold<k>_hierarchical_RF.sav``. The per-fold test probabilities are
        handed to ``cv_save_classification_result`` with the target path
        ``<outdir>/hierarchical_classification_results.csv``.
    """
    infile = argv[0]
    outdir = argv[2]

    resultdir = os.path.join(outdir, 'models')
    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(resultdir, exist_ok=True)

    # Read data file and retain only the five sleep states and 'Nonwear'
    df = pd.read_csv(infile, dtype={
        'label': object,
        'user': object,
        'position': object,
        'dataset': object,
    })
    states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']
    df = df[df['label'].isin(states)].reset_index(drop=True)

    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for cls in ctr:
        print('%s: %d (%0.2f%%)' % (cls, ctr[cls], ctr[cls] * 100.0 / len(df)))

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_range', 'ENMO_mad',
        'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prev30diff', 'ENMO_next30diff',
        'ENMO_prev60diff', 'ENMO_next60diff', 'ENMO_prev120diff', 'ENMO_next120diff',
        'angz_mean', 'angz_std', 'angz_range', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prev30diff', 'angz_next30diff',
        'angz_prev60diff', 'angz_next60diff', 'angz_prev120diff', 'angz_next120diff',
        'LIDS_mean', 'LIDS_std', 'LIDS_range', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prev30diff', 'LIDS_next30diff',
        'LIDS_prev60diff', 'LIDS_next60diff', 'LIDS_prev120diff', 'LIDS_next120diff',
    ]

    ts = df['timestamp']
    X = df[feat_cols].values
    y = df['label']
    groups = df['user']
    fnames = df['filename']

    # Class hierarchy for sleep stages
    class_hierarchy = {
        ROOT: {"Wear", "Nonwear"},
        "Wear": {"Wake", "Sleep"},
        "Sleep": {"NREM", "REM"},
        "NREM": {"Light", "NREM 3"},
        "Light": {"NREM 1", "NREM 2"},
    }
    graph = DiGraph(class_hierarchy)

    outer_cv_splits = 5
    inner_cv_splits = 5

    # Outer CV: folds are grouped by user so no user is in both train and test.
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    hierarchical_pred = []
    for out_fold, (train_indices, test_indices) in enumerate(
            group_kfold.split(X, y, groups), start=1):
        print('Processing fold ' + str(out_fold))
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        # The split yields positions, so index the Series positionally.
        out_fold_y_train = y.iloc[train_indices]
        out_fold_y_test = y.iloc[test_indices]
        out_fold_users_test = groups.iloc[test_indices]
        out_fold_ts_test = ts.iloc[test_indices]
        out_fold_fnames_test = fnames.iloc[test_indices]

        # Create a pipeline with scaler and hierarchical classifier
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', HierarchicalClassifier(
                base_estimator=RandomForestClassifier(
                    random_state=0, n_estimators=100, n_jobs=-1),
                class_hierarchy=class_hierarchy,
                prediction_depth='mlnp',
                progress_wrapper=tqdm,
            )),
        ])

        # Inner CV: stratified splits of the outer training portion,
        # materialised so they can be passed as explicit index pairs.
        strat_kfold = StratifiedKFold(
            n_splits=inner_cv_splits, random_state=0, shuffle=True)
        custom_cv_indices = list(
            strat_kfold.split(out_fold_X_train, out_fold_y_train))

        print('Training')
        search_params = {
            'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500, 700],
            'clf__base_estimator__max_depth': [5, 10, 15, None],
        }
        cv_clf = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=search_params,
            cv=custom_cv_indices,
            scoring=make_scorer(custom_h_fbeta, graph=graph),
            n_iter=5,
            n_jobs=-1,
            verbose=1,
        )
        cv_clf.fit(out_fold_X_train, out_fold_y_train)
        joblib.dump(cv_clf, os.path.join(
            resultdir, 'fold' + str(out_fold) + '_hierarchical_RF.sav'))

        print('Predicting')
        out_fold_y_pred = cv_clf.predict(out_fold_X_test)
        out_fold_y_pred_prob = cv_clf.predict_proba(out_fold_X_test)

        fitted_clf = cv_clf.best_estimator_.named_steps['clf']
        # Column order is read from the fitted classifier's own graph rather
        # than a parallel local graph, so the mapping cannot drift from it.
        # NOTE(review): assumes predict_proba columns follow graph_.nodes
        # order with ROOT excluded - confirm.
        proba_classes = [
            node for node in fitted_clf.graph_.nodes if node != ROOT]

        # Demonstrate using our hierarchical metrics module with MLB wrapper
        with multi_labeled(out_fold_y_test, out_fold_y_pred, fitted_clf.graph_) \
                as (y_test_, _y_pred, graph_, classes_):
            # The class order of the last fold labels the saved columns.
            states = classes_
            y_test_ = fill_ancestors(y_test_, graph=graph_)
            # Reorder probability columns into classes_ order.
            y_pred_prob_ = np.zeros(out_fold_y_pred_prob.shape)
            for new_idx, label in enumerate(classes_):
                old_idx = proba_classes.index(label)
                y_pred_prob_[:, new_idx] = out_fold_y_pred_prob[:, old_idx]

        hierarchical_pred.append(
            (out_fold_users_test, out_fold_ts_test, out_fold_fnames_test,
             y_test_, y_pred_prob_))

    cv_save_classification_result(
        hierarchical_pred, states,
        os.path.join(outdir, 'hierarchical_classification_results.csv'),
        method='hierarchical')