def multivariate_gaussian(x_train, y_train, x_test, y_test, outdir):
    """Score the train and test sets with the multivariate Gaussian model.

    Transforms the features, fits the Gaussians on the training data,
    computes melodiness scores for both sets, writes the resulting metrics
    to ``melodiness_scores.csv`` inside ``outdir`` and prints a summary.
    Nothing is returned.

    NOTE(review): a function with this same name is defined again later in
    this file; at import time the later definition replaces this one.
    """
    # Box-Cox transform of both feature sets, then the Gaussian fit on train.
    train_bc, test_bc = mv.transform_features(x_train, x_test)
    gauss_pos, gauss_neg = mv.fit_gaussians(train_bc, y_train)

    # Melodiness scores for the train and test sets.
    mel_train, mel_test = mv.compute_all_melodiness(
        train_bc, test_bc, gauss_pos, gauss_neg)

    # Metrics derived from the melodiness scores.
    metrics = mv.melodiness_metrics(mel_train, mel_test, y_train, y_test)

    # Original author's caveat: the threshold is picked on the test set here
    # and should probably be picked on validation numbers instead.
    threshold_result = eu.get_best_threshold(y_test, mel_test)
    best_thresh, max_fscore, thresh_plot_data = threshold_result
    # thresh_plot_data is currently not written to disk.

    scores_frame = pd.DataFrame.from_dict(metrics)
    scores_frame.to_csv(os.path.join(outdir, 'melodiness_scores.csv'))

    print("Melodiness best thresh = %s" % best_thresh)
    print("Melodiness max f1 score = %s" % max_fscore)
    print("overall melodiness scores:")
    print(scores_frame)
def multivariate_gaussian(x_train, y_train, x_test, y_test, outdir):
    """Score train and test data with the multivariate Gaussian model.

    The features are transformed (Box-Cox, via ``mv.transform_features``),
    Gaussians are fitted on the training set, melodiness scores are computed
    for both sets, and the derived metrics are written to
    ``melodiness_scores.csv`` in ``outdir`` and printed.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        outdir: Directory that receives ``melodiness_scores.csv``; it is
            assumed to exist already (this function does not create it).

    Returns:
        None.
    """
    # Transform data using the Box-Cox transform, then fit the Gaussians.
    x_train_boxcox, x_test_boxcox = mv.transform_features(x_train, x_test)
    # rv_pos / rv_neg: presumably the positive- and negative-class
    # distributions -- confirm against mv.fit_gaussians.
    rv_pos, rv_neg = mv.fit_gaussians(x_train_boxcox, y_train)

    # Compute melodiness scores on the train and test sets.
    m_train, m_test = mv.compute_all_melodiness(x_train_boxcox, x_test_boxcox,
                                                rv_pos, rv_neg)

    # Compute various metrics based on the melodiness scores.
    melodiness_scores = mv.melodiness_metrics(m_train, m_test, y_train, y_test)

    # TODO: the threshold is selected on the test set; it should probably be
    # selected on validation data instead (no validation split is passed in).
    # The third return value (threshold sweep data) is intentionally unused.
    best_thresh, max_fscore, _ = eu.get_best_threshold(y_test, m_test)

    melodiness_scores = pd.DataFrame.from_dict(melodiness_scores)
    fpath = os.path.join(outdir, 'melodiness_scores.csv')
    melodiness_scores.to_csv(fpath)

    # Parenthesized single-argument prints behave the same on Python 2 and 3.
    print("Melodiness best thresh = %s" % best_thresh)
    print("Melodiness max f1 score = %s" % max_fscore)
    print("overall melodiness scores:")
    print(melodiness_scores)
def classifier(x_train, y_train, x_valid, y_valid, x_test, y_test, outdir):
    """Train the classifier, report its scores and save it under ``outdir``.

    Writes ``cv_plot_data.csv`` and ``classifier_scores.csv`` into ``outdir``
    and the pickled model to ``classifier/rf_clf.pkl`` below it.  Returns the
    trained classifier together with the threshold chosen on the validation
    data.

    NOTE(review): a function with this same name is defined again later in
    this file; at import time the later definition replaces this one.
    """
    # Cross-validation sweep on the training data to choose the depth.
    sweep = cu.cross_val_sweep(x_train, y_train)
    best_depth, _, cv_plot_data = sweep
    print("Classifier best depth = %s" % best_depth)

    cv_frame = pd.DataFrame(np.array(cv_plot_data).T,
                            columns=['max depth', 'accuracy', 'std'])
    cv_frame.to_csv(os.path.join(outdir, 'cv_plot_data.csv'))

    # Fit with the selected depth.
    clf = cu.train_clf(x_train, y_train, best_depth)

    # Predictions for all three splits; metrics use train and test only.
    p_train, p_valid, p_test = cu.clf_predictions(
        x_train, x_valid, x_test, clf)
    score_dict = cu.clf_metrics(p_train, p_test, y_train, y_test)
    print("Classifier scores:")
    print(score_dict)

    # Threshold that maximizes the F1 score on the validation split.
    threshold_result = eu.get_best_threshold(y_valid, p_valid)
    best_thresh, max_fscore, thresh_plot_data = threshold_result
    # thresh_plot_data is currently not written to disk.

    score_frame = pd.DataFrame.from_dict(score_dict)
    score_frame.to_csv(os.path.join(outdir, 'classifier_scores.csv'))

    # Persist the fitted model in its own sub-directory.
    clf_outdir = os.path.join(outdir, 'classifier')
    if not os.path.exists(clf_outdir):
        os.mkdir(clf_outdir)
    joblib.dump(clf, os.path.join(clf_outdir, 'rf_clf.pkl'))

    print("Classifier best threshold = %s" % best_thresh)
    print("Classifier maximum f1 score = %s" % max_fscore)

    return clf, best_thresh
def classifier(x_train, y_train, x_valid, y_valid, x_test, y_test, outdir):
    """Train the classifier, score it, and persist the model and reports.

    Steps: choose the depth by cross-validation on the training data, train
    with that depth, predict on all three splits, compute metrics on the
    train/test predictions, and pick the threshold that maximizes F1 on the
    validation predictions.

    Files written:
        ``<outdir>/cv_plot_data.csv``       -- cross-validation sweep data
        ``<outdir>/classifier_scores.csv``  -- classifier metrics
        ``<outdir>/classifier/rf_clf.pkl``  -- the trained model (joblib)

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_valid: Validation features.
        y_valid: Validation labels (used for threshold selection).
        x_test: Test features.
        y_test: Test labels.
        outdir: Output directory; assumed to exist already.

    Returns:
        Tuple ``(clf, best_thresh)``: the trained classifier and the
        threshold selected on the validation split.

    Raises:
        OSError: If the ``classifier`` sub-directory cannot be created.
    """
    # Cross Validation
    best_depth, _, cv_plot_data = cu.cross_val_sweep(x_train, y_train)
    print("Classifier best depth = %s" % best_depth)

    cv_plot_data = pd.DataFrame(np.array(cv_plot_data).transpose(),
                                columns=['max depth', 'accuracy', 'std'])
    fpath = os.path.join(outdir, 'cv_plot_data.csv')
    cv_plot_data.to_csv(fpath)

    # Training
    clf = cu.train_clf(x_train, y_train, best_depth)

    # Predict and Score
    p_train, p_valid, p_test = cu.clf_predictions(x_train, x_valid, x_test, clf)
    clf_scores = cu.clf_metrics(p_train, p_test, y_train, y_test)
    print("Classifier scores:")
    print(clf_scores)

    # Get threshold that maximizes F1 score (on the validation split).
    # The third return value (threshold sweep data) is intentionally unused.
    best_thresh, max_fscore, _ = eu.get_best_threshold(y_valid, p_valid)

    clf_scores = pd.DataFrame.from_dict(clf_scores)
    fpath = os.path.join(outdir, 'classifier_scores.csv')
    clf_scores.to_csv(fpath)

    clf_outdir = os.path.join(outdir, 'classifier')
    # EAFP instead of exists()-then-mkdir(): avoids the race where the
    # directory appears between the check and the creation.
    try:
        os.mkdir(clf_outdir)
    except OSError:
        # An already existing directory is fine; anything else is an error.
        if not os.path.isdir(clf_outdir):
            raise
    clf_fpath = os.path.join(clf_outdir, 'rf_clf.pkl')
    joblib.dump(clf, clf_fpath)

    # Parenthesized single-argument prints behave the same on Python 2 and 3.
    print("Classifier best threshold = %s" % best_thresh)
    print("Classifier maximum f1 score = %s" % max_fscore)

    return clf, best_thresh