def testPdToDict():

    df = hw3.load_and_normalize_spambase()
    cols = df.columns[0:3]
    sub = utils.train_subset(df, cols, 5)
    print(sub)
    print(hw3.pandas_to_data(sub))
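# hw3.pandas_to_data presumably flattens the DataFrame into a list of per-row
# lists; a one-line pandas equivalent of that conversion (an assumption about
# the helper's behavior, not its actual source):
rows = sub.values.tolist()  # one Python list per DataFrame row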
Example #2
def q7():
    h_test, h_train = utils.load_and_normalize_housing_set()
    housingData_test = hw3.pandas_to_data(h_test)
    housingData_train = hw3.pandas_to_data(h_train)
    y, X = hw4.split_truth_from_data(housingData_train)
    y_test, X_test = hw4.split_truth_from_data(housingData_test)
    #gb = GradientBoostingRegressor(learning_rate=.1, n_estimators=1, max_depth=1)
    gb = gradb.GradientBoostRegressor(learning_rate=.1, n_estimators=100, max_depth=1, learner=lambda: DecisionTreeRegressor(max_depth=1))
    gb.fit(X, y)
    gb.print_stats()
    yhat = gb.predict(X_test)
    print(y_test[:10])
    print(yhat[:10])
    print('MSE: {}'.format(hw4.compute_mse(y_test, yhat)))
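# For comparison with the commented-out scikit-learn line above, a minimal
# sketch of the equivalent fit/predict with sklearn's GradientBoostingRegressor
# using the same hyperparameters (stumps, 100 rounds, learning rate 0.1),
# reusing the X, y, X_test, y_test splits from q7:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

gb_sk = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, max_depth=1)
gb_sk.fit(X, y)  # same training split as in q7
print(mean_squared_error(y_test, gb_sk.predict(X_test)))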
Example #3
def q1():
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    all_folds = hw3.partition_folds(spamData, k)
    tprs = []
    fprs = []
    for i in [0]:  # restricted to the first fold; use range(len(all_folds)) to run all folds
        kf_data, kf_test = dl.get_train_and_test(all_folds, i)
        y, X = hw4.split_truth_from_data(kf_data)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, i)
        predicted = adaboost.predict(X)
        print(roc_auc_score(y, predicted))
        for t in range(len(adaboost.snapshots)):
            round_number = t + 1
            ab = adaboost.snapshots[t]
            yt_pred = ab.predict(X_test)
            # Per-round test error: fraction of misclassified test points.
            round_err = float(np.sum([1 if yt != yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test)
            adaboost.adaboost_error_test[round_number] = round_err
        print(predicted[:20])
        print(y[:20])
        name = 'q1'
        directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks'
        path = os.path.join(directory, name + 'hw4errors.pdf')
        tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf')
        print(path)
        plt.Errors([adaboost.local_errors]).plot_all_errors(path)
        plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath)
        roc = plt.ROC()
        #roc.add_tpr_fpr_arrays(adaboost.tpr.values(), adaboost.fpr.values())
        get_tpr_fpr(adaboost, roc, X_test, y_test, 30)
        roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
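# get_tpr_fpr is the author's helper; the underlying TPR/FPR computation at a
# single score threshold can be sketched as follows (hypothetical function,
# assuming labels in {0, 1}; adjust if the Adaboost uses {-1, +1}):
def tpr_fpr(scores, y_true, threshold):
    # Predict positive when the boosting score clears the threshold, then
    # count true and false positives against the ground truth.
    preds = [1 if s >= threshold else 0 for s in scores]
    tp = sum(1 for p, t in zip(preds, y_true) if p == 1 and t == 1)
    fp = sum(1 for p, t in zip(preds, y_true) if p == 1 and t == 0)
    pos = sum(1 for t in y_true if t == 1)
    neg = len(y_true) - pos
    return tp / float(pos), fp / float(neg)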
Example #4
def q2_plots():
    models = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    num_points = 50
    k_folds = hw3.partition_folds(spamData, k)
    for model_type in range(4):
        roc = ROC.ROC()
        print('\nModel: {}'.format(models[model_type]))
        train_acc_sum = 0
        nb_models = []
        for ki in [0]:  # restricted to the first fold; use range(k) to run all folds
            alpha = .001 if model_type == 0 else 0  # smoothing only for the Bernoulli model
            nb_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[ki])
            nb_model.train(data_rows, truth_rows)
            for ti in range(num_points + 2):
                theta = ti * 1. / (num_points + 1)  # thresholds evenly spaced from 0 to 1
                predict = nb_model.predict(data_rows, theta)
                print(predict)
                accuracy = hw3.get_accuracy(predict, truth_rows)
                train_acc_sum += accuracy
                roc.add_tp_tn(predict, truth_rows, theta)

                #print_plot_output(ki, accuracy, theta)

        roc.plot_ROC('/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks/roc_{}.pdf'.format(model_type))
        roc.print_info()
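# The theta sweep above hits num_points + 2 evenly spaced thresholds on [0, 1];
# a quick NumPy check of that equivalence:
import numpy as np

num_points = 50
thetas = [ti * 1. / (num_points + 1) for ti in range(num_points + 2)]
assert np.allclose(thetas, np.linspace(0, 1, num_points + 2))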
Example #5
def q2():
    models = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spamData, k)
    for model_type in range(4):
        print('\nModel: {}'.format(models[model_type]))
        train_acc_sum = 0
        nb_models = []
        for ki in range(k - 1):
            alpha = .001 if model_type==0 else 0
            nb_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[ki])
            nb_model.train(data_rows, truth_rows)
            predict = nb_model.predict(data_rows)
            print(predict)
            accuracy = hw3.get_accuracy(predict, truth_rows)
            train_acc_sum += accuracy
            print_output(ki, accuracy)
            nb_models.append(nb_model)
        nb_combined = nb.NaiveBayes(model_type, alpha=.001)
        if model_type < 2:
            nb_combined.aggregate_model(nb_models)
        else:
            nb_combined.aggregate_model3(nb_models)
        truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[k - 1])
        test_predict = nb_combined.predict(data_rows)
        test_accuracy = hw3.get_accuracy(test_predict, truth_rows)
        print_test_output(test_accuracy, float(train_acc_sum)/(k-1))

    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(spamData)
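# aggregate_model / aggregate_model3 combine the k-1 per-fold models into one;
# a generic sketch of what such parameter averaging could look like for the
# Gaussian model (mus/sigmas are assumed attribute names, not nb.NaiveBayes's
# actual fields):
import numpy as np

def average_gaussian_params(models):
    # Average the per-feature means and variances across the fold models.
    mus = np.mean([m.mus for m in models], axis=0)
    sigmas = np.mean([m.sigmas for m in models], axis=0)
    return mus, sigmas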
Example #6
def q1():
    """GDA.

    Run Gaussian Discriminant Analysis on the spambase data. Use the k-folds
    from the previous problem (1 fold for testing, k-1 for training, for each
    fold). Since there are 57 real-valued features, each of the two Gaussians
    (one for the + class, one for the - class) has a mean vector with 57
    components, and the classes either share a common covariance matrix of
    size 57x57, estimated from all training data (both classes), or have two
    separate 57x57 covariance matrices (estimated separately for each class).
    (You can use a Matlab, Python, or Java built-in function to estimate
    covariance matrices, but the estimator is easy to code up.) Looking at the
    training and testing performance, does it appear that the Gaussian
    assumption (normally distributed data) holds for this particular dataset?
    """

    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())  # returns an array of arrays - this is by row
    k = 10
    train_acc_sum = 0
    k_folds = hw3.partition_folds(spamData, k)
    gdas = []
    for ki in range(k - 1):
        subset = []
        gda = hw3.GDA()
        X, truth = hw3.separate_X_and_y(k_folds[ki])
        covariance_matrix = hw3.get_covar(X)
        gda.p_y = float(sum(truth)) / len(truth)  # class prior P(y = 1)
        gda.train(X, covariance_matrix, truth)
        predictions = gda.predict(X)
        #print predictions
        accuracy = mystats.get_error(predictions, truth, True)
        #gdas.append(gda)
        print_output(ki, accuracy)
        #print gda.prob
        gdas.append(gda)
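# A minimal NumPy sketch of the shared-covariance estimate described in the
# docstring (hw3.get_covar is the author's helper; this pooled version is an
# assumption about the approach, with X an n x 57 array and y a 0/1 vector):
import numpy as np

def shared_covariance(X, y):
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    # One mean vector per class (the two Gaussians).
    mu_pos = X[y == 1].mean(axis=0)
    mu_neg = X[y == 0].mean(axis=0)
    # Center each point by its own class mean, then pool both classes
    # into a single 57x57 covariance estimate.
    centered = np.where((y == 1)[:, None], X - mu_pos, X - mu_neg)
    return centered.T.dot(centered) / len(y)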
Example #7
def testTransposeArray():
    dfup = hw3.load_and_normalize_spambase()
    cols = dfup.columns[0:3]
    sub = utils.train_subset(dfup, cols, 5)
    up = hw3.pandas_to_data(sub)
    print(up)
    trans = hw3.transpose_array(up)
    print(trans)
Example #8
def q6():
    """ Bagging - sample with replacement """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    y, X = hw4.split_truth_from_data(spamData)
    bagged = bag.Bagging(max_rounds=100, sample_size=1000, learner=lambda: DecisionTreeClassifier(max_depth=3))
    bagged.fit(X, y)
    kf_fold = hw4.partition_folds(spamData, .4)
    test_y, test_X = hw4.split_truth_from_data(kf_fold[0])
    test_pred = bagged.predict(test_X)
    test_y = bagged._check_y(test_y)
    test_pred = bagged._check_y(test_pred)
    test_error = float(sum([0 if py == ty else 1 for py, ty in zip(test_pred, test_y)]))/len(test_y)
    print('Final testing error: {}'.format(test_error))
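# The "sample with replacement" step of bagging can be sketched with NumPy
# (bootstrap_sample is a hypothetical helper, not part of the bag module):
import numpy as np

def bootstrap_sample(X, y, sample_size):
    # Draw indices uniformly with replacement: some rows repeat,
    # others are left out ("out of bag").
    idx = np.random.randint(0, len(X), size=sample_size)
    return [X[i] for i in idx], [y[i] for i in idx]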
Example #9
def q3():
    """Run your code from PB1 on Spambase dataset to perform Active Learning.
    Specifically:
    - start with a training set of about 5% of the data (selected randomly)
    - iterate M episodes: train the Adaboost for T rounds; from the datapoints
      not in the training set, select the 2% that are closest to the
      separation surface (boosting score F(x) closest to 0) and add them to
      the training set (with labels). Repeat until the size of the training
      set reaches 50% of the data.

    How does the performance improve as the training set grows? Compare the
    performance of the Adaboost algorithm on a c% randomly selected training
    set with a c% actively-built training set for several values of c: 5, 10,
    15, 20, 30, 50.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    percent = .05
    all_folds = hw4.partition_folds_q4(spamData, percent)
    kf_train = all_folds[0]
    kf_test = all_folds[1]
    left_over = all_folds[2]

    while len(kf_train) < len(spamData)/2:
        y, X = hw4.split_truth_from_data(kf_train)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx')

        yt_pred = adaboost.predict(X_test)
        order = adaboost.rank(X_test)
        yt_pred = adaboost._check_y(yt_pred)
        y_test = adaboost._check_y(y_test)
        round_err = float(np.sum([1 if yt!=yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test)

        print('Error {}'.format(round_err))
        shift_number = int(len(order) * .02)  # number of items to switch into training set
        mask = []
        for i in range(shift_number):
            mask.append(order[i])
            kf_train.append(kf_test[order[i]])
        new_test = [kf_test[i] for i in range(len(kf_test)) if i not in mask]
        for i in range(len(mask)):
            new_test.append(left_over[i])
        left_over = left_over[len(mask):]
        kf_test = new_test[:]
        print('test len {} train len {} leftover len {} shifting {}'.format(len(kf_test), len(kf_train), len(left_over), shift_number))
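# adaboost.rank orders test points for the selection step above; a minimal
# sketch of that idea, assuming raw boosting scores F(x) are available
# (rank itself is the author's helper):
import numpy as np

def rank_by_margin(scores):
    # Scores closest to 0 lie nearest the separation surface, so those
    # points are the most informative ones to label and add next.
    return np.argsort(np.abs(np.asarray(scores)))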
def spamData():
    return hw3.pandas_to_data(hw3.load_and_normalize_spambase())