Example #1
def multivariate_gaussian(x_train, y_train, x_test, y_test, outdir):
    # Score with Multivariate Gaussian

    # Transform the features with a Box-Cox transform and fit per-class multivariate Gaussians.
    x_train_boxcox, x_test_boxcox = mv.transform_features(x_train, x_test)
    rv_pos, rv_neg = mv.fit_gaussians(x_train_boxcox, y_train)

    # Compute melodiness scores on train and test set
    m_train, m_test = mv.compute_all_melodiness(x_train_boxcox, x_test_boxcox,
                                                rv_pos, rv_neg)

    # Compute various metrics based on melodiness scores.
    melodiness_scores = mv.melodiness_metrics(m_train, m_test, y_train, y_test)
    best_thresh, max_fscore, thresh_plot_data = \
        eu.get_best_threshold(y_test, m_test) # THIS SHOULD PROBABLY BE VALIDATION NUMBERS...

    # thresh_plot_data = pd.DataFrame(np.array(thresh_plot_data).transpose(),
    #                                 columns=['recall', 'precision',
    #                                          'thresh', 'f1'])
    # fpath = os.path.join(outdir, 'thresh_plot_data.csv')
    # thresh_plot_data.to_csv(fpath)

    melodiness_scores = pd.DataFrame.from_dict(melodiness_scores)
    fpath = os.path.join(outdir, 'melodiness_scores.csv')
    melodiness_scores.to_csv(fpath)

    print "Melodiness best thresh = %s" % best_thresh
    print "Melodiness max f1 score = %s" % max_fscore
    print "overall melodiness scores:"
    print melodiness_scores
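
The mv helpers above are project-specific. As a rough sketch of the idea (not the module's actual code), the snippet below fits one multivariate Gaussian per class and scores "melodiness" as the likelihood ratio of the two; the function names are illustrative and the features are assumed to be already Box-Cox transformed.

import numpy as np
from scipy.stats import multivariate_normal

def fit_class_gaussians(x_train, y_train):
    """Fit one multivariate Gaussian to the melody (1) and non-melody (0) examples."""
    pos = x_train[y_train == 1]
    neg = x_train[y_train == 0]
    rv_pos = multivariate_normal(mean=pos.mean(axis=0),
                                 cov=np.cov(pos, rowvar=False))
    rv_neg = multivariate_normal(mean=neg.mean(axis=0),
                                 cov=np.cov(neg, rowvar=False))
    return rv_pos, rv_neg

def melodiness(x, rv_pos, rv_neg):
    """Likelihood ratio: how much more 'melody-like' than 'non-melody-like' x is."""
    return rv_pos.pdf(x) / rv_neg.pdf(x)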
Example #2
def classifier(x_train, y_train, x_valid, y_valid, x_test, y_test, outdir):
    """ Train Classifier
    """

    # Cross Validation
    best_depth, _, cv_plot_data = cu.cross_val_sweep(x_train, y_train)
    print "Classifier best depth = %s" % best_depth

    cv_plot_data = pd.DataFrame(np.array(cv_plot_data).transpose(),
                                columns=['max depth', 'accuracy', 'std'])
    fpath = os.path.join(outdir, 'cv_plot_data.csv')
    cv_plot_data.to_csv(fpath)

    # Training
    clf = cu.train_clf(x_train, y_train, best_depth)

    # Predict and Score
    p_train, p_valid, p_test = cu.clf_predictions(x_train, x_valid, x_test,
                                                  clf)
    clf_scores = cu.clf_metrics(p_train, p_test, y_train, y_test)
    print "Classifier scores:"
    print clf_scores

    # Get threshold that maximizes F1 score
    best_thresh, max_fscore, thresh_plot_data = \
        eu.get_best_threshold(y_valid, p_valid)

    # thresh_plot_data = pd.DataFrame(np.array(thresh_plot_data).transpose(),
    #                                 columns=['recall', 'precision',
    #                                          'thresh', 'f1'])
    # fpath = os.path.join(outdir, 'thresh_plot_data.csv')
    # thresh_plot_data.to_csv(fpath)

    clf_scores = pd.DataFrame.from_dict(clf_scores)
    fpath = os.path.join(outdir, 'classifier_scores.csv')
    clf_scores.to_csv(fpath)

    clf_outdir = os.path.join(outdir, 'classifier')
    if not os.path.exists(clf_outdir):
        os.mkdir(clf_outdir)
    clf_fpath = os.path.join(clf_outdir, 'rf_clf.pkl')
    joblib.dump(clf, clf_fpath)

    print "Classifier best threshold = %s" % best_thresh
    print "Classifier maximum f1 score = %s" % max_fscore

    return clf, best_thresh
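
eu.get_best_threshold is project code; below is a minimal sketch of the threshold tuning it appears to perform, assuming p_valid holds class-1 probabilities and using a plain grid sweep that maximizes F1 on the validation split (the helper name and grid are assumptions). Tuning on validation data is exactly what the comment in Example #1 asks for.

import numpy as np
from sklearn.metrics import f1_score

def sweep_thresholds(y_valid, p_valid, n_steps=101):
    """Return the probability threshold that maximizes F1 on the validation set."""
    best_thresh, max_fscore = 0.5, 0.0
    for thresh in np.linspace(0.0, 1.0, n_steps):
        preds = (np.asarray(p_valid) >= thresh).astype(int)
        fscore = f1_score(y_valid, preds)
        if fscore > max_fscore:
            best_thresh, max_fscore = thresh, fscore
    return best_thresh, max_fscore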
Example #3
def train_and_classify(mdb_files, train, test, dset_contour_dict,
                       dset_annot_dict):
    '''Train a contour classifier and decode melodies on the test tracks.

    - cross-validate the best depth of the Random Forest Classifier: cu.cross_val_sweep
    - classify all contours and compute scikit-learn metrics: cu.clf_predictions
    - pick the threshold with the best f-measure on the validation set: eu.get_best_threshold(Y_valid, P_valid)
    - classify the test contours: eu.contour_probs
    - melody decoding: gm.melody_from_clf

    Contour labeling is assumed to be already done.
    '''
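    # Hold out half as many tracks as there are in the test set from the end
    # of the shuffled training list to use as a validation split.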
    random.shuffle(train)
    n_train = len(train) - (len(test) // 2)
    train_tracks = mdb_files[train[:n_train]]
    valid_tracks = mdb_files[train[n_train:]]
    test_tracks = mdb_files[test]

    train_contour_dict = {k: dset_contour_dict[k] for k in train_tracks}
    valid_contour_dict = {k: dset_contour_dict[k] for k in valid_tracks}
    test_contour_dict = {k: dset_contour_dict[k] for k in test_tracks}

    train_annot_dict = {k: dset_annot_dict[k] for k in train_tracks}
    valid_annot_dict = {k: dset_annot_dict[k] for k in valid_tracks}
    test_annot_dict = {k: dset_annot_dict[k] for k in test_tracks}

    partial_olap_stats, zero_olap_stats = eu.olap_stats(train_contour_dict)
    print 'overlap stats on train data:'
    print partial_olap_stats

    anyContourDataFrame = dset_contour_dict[dset_contour_dict.keys()[0]]

    #### Convert the pandas contour data into scikit-learn feature matrices
    feats, idxStartFeatures, idxEndFeatures = getFeatureInfo(
        anyContourDataFrame)
    print 'idxStartFeatures = %s' % idxStartFeatures
    print 'idxEndFeatures = %s' % idxEndFeatures

    # Convert each contour dict to a feature matrix and label vector,
    # replacing -inf and NaN feature values with 0.
    X_train, Y_train = cc.pd_to_sklearn(train_contour_dict, idxStartFeatures,
                                        idxEndFeatures)
    X_train[X_train == -np.inf] = 0
    X_train = np.nan_to_num(X_train)

    X_valid, Y_valid = cc.pd_to_sklearn(valid_contour_dict, idxStartFeatures,
                                        idxEndFeatures)
    X_valid[X_valid == -np.inf] = 0
    X_valid = np.nan_to_num(X_valid)

    X_test, Y_test = cc.pd_to_sklearn(test_contour_dict, idxStartFeatures,
                                      idxEndFeatures)
    X_test[X_test == -np.inf] = 0
    X_test = np.nan_to_num(X_test)

    ##################### Cross-validate the best max depth of the Random Forest Classifier
    best_depth, max_cv_accuracy, plot_dat = cu.cross_val_sweep(X_train,
                                                               Y_train,
                                                               plot=False)
    print "best depth is {}".format(best_depth)
    print "max_cv_accuracy is {}".format(max_cv_accuracy)

    df = pd.DataFrame(np.array(plot_dat).transpose(),
                      columns=['max depth', 'accuracy', 'std'])

    ##################### 3.2 TRAIN and CLASSIFY
    clf = cu.train_clf(X_train, Y_train, best_depth)

    P_train, P_valid, P_test = cu.clf_predictions(X_train, X_valid, X_test,
                                                  clf)
    clf_scores = cu.clf_metrics(P_train, P_test, Y_train, Y_test)
    print clf_scores['test']

    #### Get the threshold with the best f-measure on the validation set
    best_thresh, max_fscore, plot_data = eu.get_best_threshold(
        Y_valid, P_valid)
    print "best threshold = %s" % best_thresh
    print "maximum achieved f score = %s" % max_fscore

    # classify and add the melody probability for each contour as a field in the dict
    for key in test_contour_dict.keys():
        test_contour_dict[key] = eu.contour_probs(clf, test_contour_dict[key],
                                                  idxStartFeatures,
                                                  idxEndFeatures)

    ################### 3.3 Melody decoding (Viterbi)
    mel_output_dict = {}
    for key in test_contour_dict.keys():
        print key
        mel_output_dict[key] = gm.melody_from_clf(test_contour_dict[key],
                                                  prob_thresh=best_thresh)

    # mel_output_dict[key] = contours_to_vocal(test_contour_dict[key], prob_thresh=best_thresh)
    return mel_output_dict, test_annot_dict, clf, feats
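
cu.cross_val_sweep (used here and in Example #2) is project code as well; the sketch below is a hedged approximation of what such a max-depth sweep might look like with scikit-learn, assuming k-fold cross-validated accuracy and a Random Forest (the helper name, depth grid, and n_estimators are assumptions, not the module's actual settings).

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def depth_sweep(x_train, y_train, depths=range(2, 20), n_folds=5):
    """Pick the max_depth with the highest mean cross-validated accuracy."""
    plot_data = []
    for depth in depths:
        clf = RandomForestClassifier(n_estimators=100, max_depth=depth)
        scores = cross_val_score(clf, x_train, y_train, cv=n_folds)
        plot_data.append((depth, scores.mean(), scores.std()))
    best_depth, max_cv_accuracy, _ = max(plot_data, key=lambda row: row[1])
    return best_depth, max_cv_accuracy, plot_data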