Beispiel #1
0
def classify_digits():
    r"""Fit a hierarchical classifier on a subset of digits and print its scores.

    The five digit classes requested from ``make_digits_dataset`` are the
    leaves of this hierarchy:

            <ROOT>
           /      \
          A        B
         / \       |  \
        1   7      C   9
                 /   \
                3     8

    The base estimator handed to ``HierarchicalClassifier`` is a pipeline of
    ``TruncatedSVD`` followed by an RBF-kernel ``SVC``. After an 80/20
    train/test split the function prints a flat classification report and
    the hierarchical F-beta score.
    """
    hierarchy = {
        ROOT: ["A", "B"],
        "A": ["1", "7"],
        "B": ["C", "9"],
        "C": ["3", "8"],
    }

    svd = TruncatedSVD(n_components=24)
    svc = svm.SVC(gamma=0.001, kernel="rbf", probability=True)
    model = HierarchicalClassifier(
        base_estimator=make_pipeline(svd, svc),
        class_hierarchy=hierarchy,
    )

    features, labels = make_digits_dataset(targets=[1, 7, 3, 8, 9], as_str=False)
    print(type(features), features.shape, features)

    # Hierarchy nodes are strings, so the targets are converted to strings too.
    labels = labels.astype(str)
    print(type(labels[0]), labels.shape, labels)

    split = train_test_split(
        features, labels, test_size=0.2, random_state=RANDOM_STATE)
    train_X, test_X, train_y, test_y = split

    model.fit(train_X, train_y)
    predicted = model.predict(test_X)

    report = classification_report(test_y, predicted)
    print("Classification Report:\n", report)

    # Hierarchical metrics are computed on the output of multi_labeled.
    with multi_labeled(test_y, predicted, model.graph_) as (true_ml, pred_ml, graph_ml):
        print("h_fbeta_score: ", h_fbeta_score(true_ml, pred_ml, graph_ml))
def custom_h_fbeta(y_true, y_pred, graph=None):
    """Return only the F-beta component of ``h_fbeta_score``.

    ``y_true`` and ``y_pred`` are passed through ``multi_labeled`` together
    with ``graph``; the transformed labels and graph are then scored and the
    third element of the (precision, recall, fbeta) result is returned.
    """
    with multi_labeled(y_true, y_pred, graph) as wrapped:
        true_ml, pred_ml, graph_ml, _classes = wrapped
        _precision, _recall, fbeta = h_fbeta_score(true_ml, pred_ml, graph_ml)
        return fbeta
def test_h_scores(graph, y_true, y_pred, expected_hr_score, expected_hp_score, expected_hf1_score):
    """Check hierarchical recall, precision and F-beta against expected values.

    Each metric is evaluated on the ``multi_labeled`` view of the inputs and
    must lie within 0.0001 of its expected score.
    """
    # Same order as the metrics are listed in the signature: hR, hP, hF1.
    checks = (
        (h_recall_score, expected_hr_score),
        (h_precision_score, expected_hp_score),
        (h_fbeta_score, expected_hf1_score),
    )
    with multi_labeled(y_true, y_pred, graph) as (true_ml, pred_ml, graph_ml):
        for metric, expected in checks:
            actual = metric(y_true=true_ml, y_pred=pred_ml, class_hierarchy=graph_ml)
            assert_that(actual, is_(close_to(expected, delta=0.0001)))
Beispiel #4
0
def main(argv):
    """Evaluate a hierarchical sleep-stage classifier with nested cross-validation.

    Args:
        argv: Sequence of two strings. ``argv[0]`` is the path of the feature
            CSV; ``argv[1]`` is the output directory, created if missing.

    Rows are restricted to the five sleep-stage labels and to the 'UPenn'
    dataset. The outer loop is a ``GroupKFold`` over users; inside each outer
    fold a ``RandomizedSearchCV`` tunes the random-forest base estimator on
    stratified inner folds, scored with ``custom_h_fbeta``. Per-fold overall
    and per-node precision/recall/F-beta are collected, reported through
    ``get_classification_report`` and the per-user predictions are written to
    ``hierarchical_results.csv`` in the output directory.
    """
    infile = argv[0]
    outdir = argv[1]

    # exist_ok avoids the race between a separate exists() check and creation.
    os.makedirs(outdir, exist_ok=True)

    # Read data file and retain data only corresponding to 5 sleep states
    df = pd.read_csv(infile,
                     dtype={
                         'label': object,
                         'user': object,
                         'position': object,
                         'dataset': object
                     })
    sleep_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    keep = df['label'].isin(sleep_states) & (df['dataset'] == 'UPenn')
    # drop=True keeps the original column set while giving a clean 0..n-1 index.
    df = df[keep].reset_index(drop=True)
    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for cls in ctr:
        print('%s: %d (%0.2f%%)' % (cls, ctr[cls], ctr[cls] * 100.0 / len(df)))

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_min', 'ENMO_max', 'ENMO_mad',
        'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prevdiff', 'ENMO_nextdiff',
        'angz_mean', 'angz_std', 'angz_min', 'angz_max', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prevdiff', 'angz_nextdiff',
        'LIDS_mean', 'LIDS_std', 'LIDS_min', 'LIDS_max', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prevdiff', 'LIDS_nextdiff',
    ]

    X = df[feat_cols].values
    y = df['label']
    groups = df['user']

    # Class hierarchy for sleep stages
    class_hierarchy = {
        ROOT: {"Wake", "Sleep"},
        "Sleep": {"NREM", "REM"},
        "NREM": {"Light", "NREM 3"},
        "Light": {"NREM 1", "NREM 2"}
    }

    graph = DiGraph(class_hierarchy)

    outer_cv_splits = 5
    inner_cv_splits = 3

    # Nodes reported individually; 'Overall' holds the hierarchical scores.
    node_names = ['Wake', 'Sleep', 'REM', 'NREM', 'NREM 3', 'Light',
                  'NREM 1', 'NREM 2']
    results = {
        name: {'precision': [], 'recall': [], 'fbeta': []}
        for name in node_names + ['Overall']
    }

    # Outer CV
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    hierarchical_pred = []
    outer_folds = group_kfold.split(X, y, groups)
    for out_fold, (train_indices, test_indices) in enumerate(outer_folds, start=1):
        print('Processing fold ' + str(out_fold))
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        # The split yields positions, so index the Series positionally.
        out_fold_y_train = y.iloc[train_indices]
        out_fold_y_test = y.iloc[test_indices]
        out_fold_users_test = groups.iloc[test_indices]

        # Create a pipeline with scaler and hierarchical classifier
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            (
                'clf',
                HierarchicalClassifier(
                    base_estimator=RandomForestClassifier(random_state=0,
                                                          n_estimators=100,
                                                          n_jobs=-1),
                    class_hierarchy=class_hierarchy,
                    prediction_depth='mlnp',
                    progress_wrapper=tqdm,
                ))
        ])

        # Inner CV: materialise the splits so the search reuses fixed folds.
        strat_kfold = StratifiedKFold(n_splits=inner_cv_splits,
                                      random_state=0,
                                      shuffle=True)
        custom_cv_indices = list(
            strat_kfold.split(out_fold_X_train, out_fold_y_train))

        print('Training')
        search_params = {
            'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500],
            'clf__base_estimator__max_depth': [5, 10, None],
        }
        cv_clf = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=search_params,
            cv=custom_cv_indices,
            scoring=make_scorer(custom_h_fbeta, graph=graph),
            n_iter=5,
            n_jobs=-1,
            verbose=1)
        cv_clf.fit(out_fold_X_train, out_fold_y_train)
        print('Predicting')
        out_fold_y_pred = cv_clf.predict(out_fold_X_test)

        best_clf = cv_clf.best_estimator_
        hier_clf = best_clf.named_steps['clf']

        # Hierarchical metrics are computed on the output of multi_labeled.
        with multi_labeled(out_fold_y_test, out_fold_y_pred, hier_clf.graph_) as (
                y_test_, y_pred_, graph_, classes_):
            fold_h_prec, fold_h_rec, fold_h_fbeta = h_fbeta_score(
                y_test_, y_pred_, graph_)
            results['Overall']['precision'].append(fold_h_prec)
            results['Overall']['recall'].append(fold_h_rec)
            results['Overall']['fbeta'].append(fold_h_fbeta)
            print("Fold %d: precision: %0.4f, recall: %0.4f, fbeta: %0.4f" %
                  (out_fold, fold_h_prec, fold_h_rec, fold_h_fbeta))

            y_test_ = fill_ancestors(y_test_, graph=graph_)
            y_pred_ = fill_ancestors(y_pred_, graph=graph_)

            hierarchical_pred.append(
                (out_fold_users_test, y_test_, y_pred_, classes_))

            # Per-node scores, one call per node of interest.
            for node in node_names:
                node_prec, node_rec, node_fbeta, _ = get_node_metrics(
                    y_test_, y_pred_, classes_, node)
                results[node]['precision'].append(node_prec)
                results[node]['recall'].append(node_rec)
                results[node]['fbeta'].append(node_fbeta)

    get_classification_report(results)
    save_user_report(hierarchical_pred,
                     os.path.join(outdir, 'hierarchical_results.csv'))
def main(argv):
    """Score a feature file with previously trained models and save the result.

    Args:
        argv: Sequence of five strings:
            ``argv[0]`` input feature CSV,
            ``argv[1]`` directory holding the saved models,
            ``argv[2]`` mode: 'binary', 'nonwear', 'multiclass' or
            'hierarchical',
            ``argv[3]`` '0' to use a single model, '1' to average the
            predictions of every matching model,
            ``argv[4]`` output directory.

    Raises:
        ValueError: if the mode is unknown, or if 'hierarchical' is requested
            without the ensemble flag (no single-model path exists for it).
        FileNotFoundError: if no file in the model directory contains the
            mode name.
    """
    infile = argv[0]
    modeldir = argv[1]
    mode = argv[2]
    ensemble = int(argv[3])  # 0 - use best model, 1 - use ensemble
    outdir = argv[4]

    # Fail early with a clear message instead of an UnboundLocalError later.
    known_modes = ('binary', 'nonwear', 'multiclass', 'hierarchical')
    if mode not in known_modes:
        raise ValueError('unknown mode %r; expected one of %s' %
                         (mode, ', '.join(known_modes)))
    if mode == 'hierarchical' and not ensemble:
        raise ValueError(
            'hierarchical mode is only supported with ensemble=1')

    df = pd.read_csv(infile)
    method = 'feat_eng'
    if mode == 'binary':
        states = ['Wake', 'Sleep']
        collate_states = ['NREM 1', 'NREM 2', 'NREM 3', 'REM']
        df.loc[df['label'].isin(collate_states), 'label'] = 'Sleep'
    elif mode == 'nonwear':
        states = ['Wear', 'Nonwear']
        collate_states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
        df.loc[df['label'].isin(collate_states), 'label'] = 'Wear'
    elif mode == 'multiclass':
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM']
    else:  # hierarchical
        method = 'hierarchical'
        states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']
        # Class hierarchy for sleep stages
        class_hierarchy = {
            ROOT: {"Wear", "Nonwear"},
            "Wear": {"Wake", "Sleep"},
            "Sleep": {"NREM", "REM"},
            "NREM": {"Light", "NREM 3"},
            "Light": {"NREM 1", "NREM 2"}
        }

        graph = DiGraph(class_hierarchy)
        classes = [node for node in graph.nodes if node != ROOT]

    df = df[df['label'].isin(states)].reset_index(drop=True)

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_range', 'ENMO_mad', 'ENMO_entropy1',
        'ENMO_entropy2', 'ENMO_prev30diff', 'ENMO_next30diff',
        'ENMO_prev60diff', 'ENMO_next60diff', 'ENMO_prev120diff',
        'ENMO_next120diff', 'angz_mean', 'angz_std', 'angz_range', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prev30diff', 'angz_next30diff',
        'angz_prev60diff', 'angz_next60diff', 'angz_prev120diff',
        'angz_next120diff', 'LIDS_mean', 'LIDS_std', 'LIDS_range', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prev30diff', 'LIDS_next30diff',
        'LIDS_prev60diff', 'LIDS_next60diff', 'LIDS_prev120diff',
        'LIDS_next120diff'
    ]

    ts_test = df['timestamp']
    x_test = df[feat_cols].values
    y_test = df['label']
    if mode != 'hierarchical':
        # Non-hierarchical models work on the integer position within states.
        y_test = np.array([states.index(i) for i in y_test])
    users_test = df['user']
    fnames_test = df['filename']

    N = x_test.shape[0]

    model_fnames = [fname for fname in os.listdir(modeldir) if mode in fname]
    if not model_fnames:
        raise FileNotFoundError('no model file containing %r found in %s' %
                                (mode, modeldir))

    # NOTE: joblib and pickle both execute arbitrary code while loading;
    # modeldir must only contain trusted model files.
    if ensemble:
        nfolds = len(model_fnames)
        y_pred = None
        for fold, fname in enumerate(model_fnames):
            print('Processing fold ' + str(fold + 1))
            model_path = os.path.join(modeldir, fname)
            if mode != 'hierarchical':
                with open(model_path, 'rb') as model_file:
                    scaler, cv_clf = joblib.load(model_file)
                x_test_sc = scaler.transform(x_test)
                fold_y_pred = cv_clf.predict_proba(x_test_sc)
                if mode == 'multiclass':
                    # predict_proba is indexed per class here; keep column 1
                    # of each entry as that class's probability.
                    fold_y_pred_collated = np.zeros(
                        (fold_y_pred[0].shape[0], len(fold_y_pred)))
                    for cls in range(len(fold_y_pred)):
                        fold_y_pred_collated[:, cls] = fold_y_pred[cls][:, 1]
                    fold_y_pred = fold_y_pred_collated
            else:
                # NOTE(review): hierarchical models are read with pickle while
                # the other modes use joblib - confirm this matches how the
                # files were written.
                with open(model_path, 'rb') as model_file:
                    cv_clf = pickle.load(model_file)
                cv_clf = cv_clf.best_estimator_
                fold_y_pred = cv_clf.predict(x_test)
                fold_y_pred_prob = cv_clf.predict_proba(x_test)
                hier_clf = cv_clf.named_steps['clf']
                with multi_labeled(y_test, fold_y_pred, hier_clf.graph_) as (
                        y_test_, y_pred_, graph_, classes_):
                    states = classes_
                    y_test_ = fill_ancestors(y_test_, graph=graph_)
                    # Reorder probability columns from the graph node order
                    # (classes) to the order reported by multi_labeled.
                    fold_y_pred_ = np.zeros(fold_y_pred_prob.shape)
                    for new_idx, label in enumerate(classes_):
                        old_idx = classes.index(label)
                        fold_y_pred_[:, new_idx] = fold_y_pred_prob[:, old_idx]
                fold_y_pred = fold_y_pred_

            # Accumulate prediction probabilities
            if y_pred is None:
                y_pred = np.zeros((N, len(states)))
            y_pred += fold_y_pred

        # Get average predictions
        y_pred = y_pred / float(nfolds)
        if mode == 'hierarchical':
            y_test = y_test_
    else:
        # Single-model path; hierarchical was rejected above.
        model_path = os.path.join(modeldir, model_fnames[0])
        with open(model_path, 'rb') as model_file:
            scaler, clf = joblib.load(model_file)
        x_test_sc = scaler.transform(x_test)
        y_pred = clf.predict_proba(x_test_sc)

    # Save test results
    y_pred = [(users_test, ts_test, fnames_test, y_test, y_pred)]
    cv_save_classification_result(y_pred,
                                  states,
                                  os.path.join(
                                      outdir,
                                      mode + '_test_classification.csv'),
                                  method=method)
Beispiel #6
0
def main(argv):
    """Train hierarchical sleep/non-wear models with nested CV and save them.

    Args:
        argv: Sequence of three strings. ``argv[0]`` is the feature CSV,
            ``argv[1]`` is a dataset name (accepted but not used by this
            function) and ``argv[2]`` is the output directory.

    For every outer ``GroupKFold`` split (grouped by user) a
    ``RandomizedSearchCV`` over the random-forest base estimator is fitted on
    stratified inner folds and dumped to ``<outdir>/models``. The test-fold
    probabilities are reordered to the class order reported by
    ``multi_labeled`` and all folds are written to
    ``hierarchical_classification_results.csv`` in the output directory.
    """
    infile = argv[0]
    outdir = argv[2]

    resultdir = os.path.join(outdir, 'models')
    # exist_ok avoids the race between a separate exists() check and creation.
    os.makedirs(resultdir, exist_ok=True)

    # Read data file and retain data only corresponding to the known labels
    df = pd.read_csv(infile,
                     dtype={
                         'label': object,
                         'user': object,
                         'position': object,
                         'dataset': object
                     })
    states = ['Wake', 'NREM 1', 'NREM 2', 'NREM 3', 'REM', 'Nonwear']
    df = df[df['label'].isin(states)].reset_index(drop=True)

    print('... Number of data samples: %d' % len(df))
    ctr = Counter(df['label'])
    for cls in ctr:
        print('%s: %d (%0.2f%%)' % (cls, ctr[cls], ctr[cls] * 100.0 / len(df)))

    feat_cols = [
        'ENMO_mean', 'ENMO_std', 'ENMO_range', 'ENMO_mad',
        'ENMO_entropy1', 'ENMO_entropy2', 'ENMO_prev30diff', 'ENMO_next30diff',
        'ENMO_prev60diff', 'ENMO_next60diff', 'ENMO_prev120diff', 'ENMO_next120diff',
        'angz_mean', 'angz_std', 'angz_range', 'angz_mad',
        'angz_entropy1', 'angz_entropy2', 'angz_prev30diff', 'angz_next30diff',
        'angz_prev60diff', 'angz_next60diff', 'angz_prev120diff', 'angz_next120diff',
        'LIDS_mean', 'LIDS_std', 'LIDS_range', 'LIDS_mad',
        'LIDS_entropy1', 'LIDS_entropy2', 'LIDS_prev30diff', 'LIDS_next30diff',
        'LIDS_prev60diff', 'LIDS_next60diff', 'LIDS_prev120diff', 'LIDS_next120diff',
    ]

    ts = df['timestamp']
    X = df[feat_cols].values
    y = df['label']
    groups = df['user']
    fnames = df['filename']

    # Class hierarchy for sleep stages
    class_hierarchy = {
        ROOT: {"Wear", "Nonwear"},
        "Wear": {"Wake", "Sleep"},
        "Sleep": {"NREM", "REM"},
        "NREM": {"Light", "NREM 3"},
        "Light": {"NREM 1", "NREM 2"}
    }

    graph = DiGraph(class_hierarchy)
    classes = [node for node in graph.nodes if node != ROOT]

    outer_cv_splits = 5
    inner_cv_splits = 5

    # Outer CV
    group_kfold = GroupKFold(n_splits=outer_cv_splits)
    hierarchical_pred = []
    outer_folds = group_kfold.split(X, y, groups)
    for out_fold, (train_indices, test_indices) in enumerate(outer_folds, start=1):
        print('Processing fold ' + str(out_fold))
        out_fold_X_train = X[train_indices, :]
        out_fold_X_test = X[test_indices, :]
        # The split yields positions, so index the Series positionally.
        out_fold_y_train = y.iloc[train_indices]
        out_fold_y_test = y.iloc[test_indices]
        out_fold_users_test = groups.iloc[test_indices]
        out_fold_ts_test = ts.iloc[test_indices]
        out_fold_fnames_test = fnames.iloc[test_indices]

        # Create a pipeline with scaler and hierarchical classifier
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            (
                'clf',
                HierarchicalClassifier(
                    base_estimator=RandomForestClassifier(random_state=0,
                                                          n_estimators=100,
                                                          n_jobs=-1),
                    class_hierarchy=class_hierarchy,
                    prediction_depth='mlnp',
                    progress_wrapper=tqdm,
                ))
        ])

        # Inner CV: materialise the splits so the search reuses fixed folds.
        strat_kfold = StratifiedKFold(n_splits=inner_cv_splits,
                                      random_state=0,
                                      shuffle=True)
        custom_cv_indices = list(
            strat_kfold.split(out_fold_X_train, out_fold_y_train))

        print('Training')
        search_params = {
            'clf__base_estimator__n_estimators': [50, 100, 200, 300, 500, 700],
            'clf__base_estimator__max_depth': [5, 10, 15, None],
        }
        cv_clf = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=search_params,
            cv=custom_cv_indices,
            scoring=make_scorer(custom_h_fbeta, graph=graph),
            n_iter=5,
            n_jobs=-1,
            verbose=1)
        cv_clf.fit(out_fold_X_train, out_fold_y_train)
        joblib.dump(
            cv_clf,
            os.path.join(resultdir,
                         'fold' + str(out_fold) + '_hierarchical_RF.sav'))
        print('Predicting')
        out_fold_y_pred = cv_clf.predict(out_fold_X_test)
        out_fold_y_pred_prob = cv_clf.predict_proba(out_fold_X_test)

        best_clf = cv_clf.best_estimator_
        hier_clf = best_clf.named_steps['clf']

        with multi_labeled(out_fold_y_test, out_fold_y_pred, hier_clf.graph_) as (
                y_test_, y_pred_, graph_, classes_):
            # The saved report uses the class order reported by multi_labeled.
            states = classes_
            y_test_ = fill_ancestors(y_test_, graph=graph_)
            y_pred_ = fill_ancestors(y_pred_, graph=graph_)
            # Reorder probability columns from the graph node order (classes)
            # to the classes_ order.
            y_pred_prob_ = np.zeros(out_fold_y_pred_prob.shape)
            for new_idx, label in enumerate(classes_):
                old_idx = classes.index(label)
                y_pred_prob_[:, new_idx] = out_fold_y_pred_prob[:, old_idx]

            hierarchical_pred.append(
                (out_fold_users_test, out_fold_ts_test, out_fold_fnames_test,
                 y_test_, y_pred_prob_))

    cv_save_classification_result(
        hierarchical_pred,
        states,
        os.path.join(outdir, 'hierarchical_classification_results.csv'),
        method='hierarchical')