Example #1
import sys

# svm and rf are project-local modules; a sketch of svm.run follows this example.


def classify(data, labels):
    # Choose the classifier from the first command-line argument.
    method = sys.argv[1].upper()

    method_name = {'SVM': 'SVM', 'RF': 'Random Forest'}
    test_size_arr = [0.6, 0.5, 0.4]

    print('Method: %s' % method_name[method])
    for test_size in test_size_arr:
        if method == 'SVM':
            train_score, test_score = svm.run(data, labels, test_size)
        elif method == 'RF':
            train_score, test_score = rf.run(data, labels, test_size)
        else:
            # Fall back to SVM for unrecognized methods.
            train_score, test_score = svm.run(data, labels, test_size)

        print('=============================================')
        print('train set %s | test set %s' % (1 - test_size, test_size))
        print('train set accuracy: %s' % train_score)
        print('test set accuracy: %s' % test_score)
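
Both branches above call a project-local svm.run(data, labels, test_size) (and the matching rf.run) that returns a train score and a test score. That module is not part of this listing; below is a minimal sketch of what it might look like, assuming scikit-learn, with the signature and return values inferred only from the call sites:

# Hypothetical reconstruction of the project-local svm module; only the
# signature and the two returned scores are inferred from the calls above.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def run(data, labels, test_size):
    # Hold out a fraction `test_size` of the samples for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=test_size, random_state=0)
    clf = SVC()
    clf.fit(x_train, y_train)
    # Mean accuracy on each split, matching the scores printed above.
    return clf.score(x_train, y_train), clf.score(x_test, y_test)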
Example #2
    def runSVM(self):
        # Read the SVM hyper-parameters from the UI, train, and report the results.
        if self.fileName != "":
            self.splitSize = int(self.split_lineEdit.text())
            self.kernelType = self.kernel_cb.currentText()
            self.degree = int(self.degree_lineEdit.text())
            self.regParam = float(self.regParam_lineEdit.text())
            self.tol = float(self.tol_lineEdit.text())
            if self.splitSize <= 40:
                self.results = svm.run(self.fileName, self.splitSize,
                                       self.kernelType, self.degree,
                                       self.tol, self.regParam)
                # Show results only when training actually ran; the original
                # read self.results unconditionally, which may not exist yet.
                QMessageBox.about(self, "Results:", self.results)
            else:
                print("cannot train on such a small dataset")
        else:
            print("incorrect file name!")
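
In this example svm.run takes the CSV path and the hyper-parameters directly and returns text for the message box. A minimal sketch of such a helper, assuming scikit-learn and pandas; the percent-style split and the last-column-is-label convention are assumptions, not taken from the original project:

# Hypothetical helper matching the call in runSVM; parameter handling is
# inferred from the UI code above, and the label-column convention is assumed.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def run(file_name, split_size, kernel_type, degree, tol, reg_param):
    df = pd.read_csv(file_name)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    # split_size is validated as <= 40 above, so treat it as a percentage.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_size / 100.0, random_state=0)
    clf = SVC(kernel=kernel_type.lower(), degree=degree, tol=tol, C=reg_param)
    clf.fit(X_train, y_train)
    # QMessageBox.about expects a string, so format the score as text.
    return "Test accuracy: %.3f" % clf.score(X_test, y_test)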
Example #3
import math

import numpy as np

# svm and svmpredict are project-local modules; a sketch follows this example.


def run(k, X, y, gamma):
    # k-fold cross-validation error for an SVM with parameter gamma.
    n = np.shape(X)[0]
    d = np.shape(X)[1]

    y_pred = np.zeros(shape=(n, 1))

    for i in range(k):
        # Boundaries of the i-th fold.
        lower = math.floor(n * i / k)
        upper = math.floor((n * (i + 1) / k) - 1)

        # Testing set: the i-th fold.
        T = list(range(lower, upper + 1))

        # Training set: everything outside the fold.
        S = [*range(lower), *range(upper + 1, n)]

        X_train = np.zeros(shape=(len(S), d))
        Y_train = np.zeros(shape=(len(S), 1))

        # Fill X_train and Y_train with the training rows.
        for j in range(len(S)):
            row = S[j]
            X_train[j] = X[row]
            Y_train[j] = y[row]

        # Training.
        clf = svm.run(X_train, Y_train.reshape(len(S)), gamma)

        # Predict labels for the held-out fold.
        for t in T:
            test_point = X[t].reshape((1, d))
            y_pred[t] = svmpredict.run(clf, test_point)

    # Compare flat vectors so an (n,) vs. (n, 1) shape mismatch does not
    # silently broadcast to an (n, n) comparison.
    error = np.mean(np.ravel(y) != np.ravel(y_pred))
    return error
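
The snippet relies on two project-local helpers, svm.run (train) and svmpredict.run (predict). A minimal sketch of that pair, assuming scikit-learn with an RBF kernel; only the signatures are inferred from the calls above, and the kernel choice is an assumption:

# Hypothetical contents of the svm and svmpredict modules assumed above.
from sklearn.svm import SVC


def run(X_train, y_train, gamma):
    # svm.run: fit an SVM with the given gamma and return the fitted model.
    clf = SVC(kernel='rbf', gamma=gamma)
    clf.fit(X_train, y_train)
    return clf


def predict(clf, test_point):
    # svmpredict.run: label a single test point with the fitted model.
    return clf.predict(test_point)[0]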
Example #4
def result():
    sns.set(palette="muted", color_codes=True)
    # 3a
    print('=============================================')
    print('HW1-3a')
    print('=============================================')
    # fittingGaussian('average_age')
    normalTest(df['average_age'].values, name='Col[7]')

    # 3b
    print('=============================================')
    print('HW1-3b')
    print('=============================================')
    arr_7 = grouping('average_age')
    for i in range(0, 5):
        normalTest(arr_7[i], name='Col[7] in Group %s' % (i + 1))

    print('---------------------------------------------')
    leveneTest(arr_7, name='Col[7]')

    # 3c
    print('=============================================')
    print('HW1-3c')
    print('=============================================')
    oneway(arr_7, name='Col[7]')

    # 4
    print('=============================================')
    print('HW1-4')
    print('=============================================')
    selectCol = ['message_number', 'variance_age', 'conversation_number']

    for key in selectCol:
        normalTest(df[key].values, name='Col[%s]' % key)

    print('---------------------------------------------')

    for key in selectCol:
        normalTest(np.log(df[key].values), name='log Col[%s]' % key)

    # 5b
    print('=============================================')
    print('HW1-5')
    print('=============================================')

    # without Box-Cox
    for key in selectCol:
        arr = grouping(key)
        oneway(arr, 'Col[%s]' % key)

    print('---------------------------------------------')

    # with Box-Cox
    for key in selectCol:
        arr = grouping(key, mode=True)
        oneway(arr, 'box-cox Col[%s]' % key)

    # 6
    print('=============================================')
    print('HW1-6')
    print('=============================================')
    testSizeArr = [0.6, 0.5, 0.4]

    print("SVM ['average_age', 'variance_age']")
    for size in testSizeArr:
        train_score, test_score = svm.run(df[['average_age', 'variance_age']],
                                          df['group_category'], size)

        print('[train:test = %s:%s] train accuracy=%s, test accuracy=%s.' % (
            1 - size, size, train_score, test_score))

    print('---------------------------------------------')

    print("Random Forest ['average_age', 'variance_age']")
    for size in testSizeArr:
        train_score, test_score = rf.run(df[['average_age', 'variance_age']],
                                         df['group_category'], size)

        print('[train:test = %s:%s] train accuracy=%s, test accuracy=%s.' % (
            1 - size, size, train_score, test_score))
Example #5

if __name__ == "__main__":
    loan_2 = 'data/loan_2.csv'
    # loan_type is a column converter defined elsewhere in the original file
    # (see the hypothetical sketch after this example).
    sourceData = np.loadtxt(loan_2,
                            dtype=float,
                            delimiter=',',
                            converters={0: loan_type},
                            skiprows=1)

    # x is the data, y is the labels
    y, x = np.split(sourceData, (1, ), axis=1)

    method = sys.argv[1].upper()

    method_name = {'SVM': 'SVM', 'RF': 'Random Forest'}
    test_size_arr = [0.6, 0.5, 0.4]

    print('Method: %s' % method_name[method])
    for test_size in test_size_arr:
        if method == 'SVM':
            train_score, test_score = svm.run(x, y, test_size)
        elif method == 'RF':
            train_score, test_score = rf.run(x, y, test_size)
        else:
            # Fall back to SVM for unrecognized methods.
            train_score, test_score = svm.run(x, y, test_size)

        print('=============================================')
        print('train set %s | test set %s' % (1 - test_size, test_size))
        print('train set accuracy: %s' % train_score)
        print('test set accuracy: %s' % test_score)
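
np.loadtxt applies each entry in converters to the raw text of its column before parsing; loan_type is defined elsewhere in the original file and presumably maps the first column's loan category to a number. A hypothetical converter of that shape:

# Hypothetical stand-in for loan_type; the real mapping lives in the
# original file and is not shown in this listing.
def loan_type(field):
    # np.loadtxt passes the raw field (bytes or str, depending on the
    # NumPy version); normalize before looking it up.
    if isinstance(field, bytes):
        field = field.decode()
    mapping = {'A': 0.0, 'B': 1.0, 'C': 2.0}
    return mapping.get(field.strip(), -1.0)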
Example #6
def run(arg_model,
        arg_modelname,
        arg_train,
        arg_test,
        arg_features,
        arg_featurename,
        arg_name,
        arg_preprocess,
        arg_labels,
        arg_dev=True,
        arg_hyperopt=False,
        arg_dataset="mds",
        arg_n_feats=398,
        arg_anova="f_classif",
        arg_nodes=297,
        dataloc="/u/sjeblee/research/va/data/datasets",
        arg_rebalance=""):

    dataloc = dataloc + "/" + arg_dataset
    trainname = arg_train + "_cat"  # all, adult, child, or neonate
    devname = arg_test + "_cat"
    pre = arg_preprocess
    print "pre: " + pre
    labels = arg_labels
    featureset = arg_featurename  # Name of the feature set for feature file
    features = arg_features  # type, checklist, narr_bow, narr_tfidf, narr_count, narr_vec, kw_bow, kw_tfidf, symp_train
    modeltype = arg_model  # svm, knn, nn, lstm, nb, rf
    modelname = arg_modelname
    #resultsloc_name = arg_name

    # Location of data files
    resultsloc = "/u/sjeblee/research/va/data/" + arg_name
    heideldir = "/u/sjeblee/tools/heideltime/heideltime-standalone"
    scriptdir = "/u/sjeblee/research/va/git/verbal-autopsy"

    # Setup
    if not os.path.exists(resultsloc):
        os.mkdir(resultsloc)
    trainset = dataloc + "/train_" + trainname + ".xml"
    devset = ""
    devfeatures = ""
    devresults = ""
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset
    element = "narrative"

    # Preprocessing
    spname = "spell"
    print "Preprocessing..."
    if "spell" in pre:
        print "Running spelling correction..."
        trainsp = dataloc + "/train_" + trainname + "_" + spname + ".xml"
        devsp = ""
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + spname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + spname + ".xml"
        if not os.path.exists(trainsp):
            print("spellcorrect on train data...")
            spellcorrect.run(trainset, trainsp)
        if not os.path.exists(devsp):
            print("spellcorrect on test data...")
            spellcorrect.run(devset, devsp)

        trainset = trainsp
        devset = devsp
        devname = devname + "_" + spname
        trainname = trainname + "_" + spname

    if "heidel" in pre:
        print "Running Heideltime..."
        with cd(heideldir):
            trainh = dataloc + "/train_" + trainname + "_ht.xml"
            if not os.path.exists(trainh):
                heidel_tag.run(trainset, trainh)
                fixtags(trainh)
            trainset = trainh
            devh = ""
            if arg_dev:
                devh = dataloc + "/dev_" + devname + "_ht.xml"
            else:
                devh = dataloc + "/test_" + devname + "_ht.xml"
            if not os.path.exists(devh):
                heidel_tag.run(devset, devh)
                fixtags(devh)
            devset = devh
        devname = devname + "_ht"
        trainname = trainname + "_ht"

    if "medttk" in pre:
        print "Running medttk..."
        #with cd(heideldir):
        trainh = dataloc + "/train_" + trainname + "_medttk.xml"
        if not os.path.exists(trainh):
            medttk_tag.run(trainset, trainh)
            fixtags(trainh)
        trainset = trainh
        devh = ""
        if arg_dev:
            devh = dataloc + "/dev_" + devname + "_medttk.xml"
        else:
            devh = dataloc + "/test_" + devname + "_medttk.xml"
        if not os.path.exists(devh):
            medttk_tag.run(devset, devh)
            fixtags(devh)
        devset = devh
        devname = devname + "_medttk"
        trainname = trainname + "_medttk"
        element = "narr_medttk"

    if "symp" in pre:
        print "Tagging symptoms..."
        sympname = "symp"
        trainsp = dataloc + "/train_" + trainname + "_" + sympname + ".xml"
        #devsp = ""
        #if arg_dev:
        #    devsp = dataloc + "/dev_" + devname + "_" + sympname + ".xml"
        #else:
        #    devsp = dataloc + "/test_" + devname + "_" + sympname + ".xml"
        if not os.path.exists(trainsp):
            print "tag_symptoms on train data..."
            tag_symptoms.run(trainset, trainsp)
            #fixtags(trainsp)
        #if not os.path.exists(devsp):
        #    print "tag_symptoms on test data..."
        #    tag_symptoms.run(devset, devsp)
        #    fixtags(devsp)

        trainset = trainsp
        #devset = devsp
        #devname = devname + "_" + spname
        trainname = trainname + "_" + spname

    # Feature Extraction
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset
    print "trainfeatures: " + trainfeatures
    print "devfeatures: " + devfeatures
    stem = False
    lemma = False
    if "stem" in pre:
        stem = True
    if "lemma" in pre:
        lemma = True
    print "stem: " + str(stem) + " lemma: " + str(lemma)
    if not (os.path.exists(trainfeatures) and os.path.exists(devfeatures)):
        print "Extracting features..."
        extract_features_dirichlet.run(trainset, trainfeatures, devset,
                                       devfeatures, features, labels, stem,
                                       lemma, element)

    # Rebalance dataset?
    if arg_rebalance != "":
        rebalancedfeatures = trainfeatures + "." + arg_rebalance
        rebalance.run(trainfeatures, rebalancedfeatures, labels, arg_rebalance)
        trainfeatures = rebalancedfeatures

    # Model
    if arg_hyperopt:
        print("Running hyperopt...")
        model_dirichlet.hyperopt(modeltype, trainfeatures, devfeatures,
                                 devresults, resultsloc, labels)
    else:
        print("Creating model...")
        if modeltype == "nb":
            svm.run(modeltype, trainfeatures, devfeatures, devresults, labels)
        #elif modeltype == "cnn":
        #    model_temp.run(modeltype, modelname, trainfeatures, devfeatures, devresults, resultsloc, labels, arg_n_feats, arg_anova, arg_nodes)
        else:
            model_dirichlet.run(modeltype, modelname, trainfeatures,
                                devfeatures, devresults, resultsloc, labels,
                                arg_n_feats, arg_anova, arg_nodes)

        # Results statistics
        print "Calculating scores..."
        results_stats.run(devresults, devresults + ".stats")

    print "Done"
Example #7
    # Earlier variants, kept for reference; only the last assignment is live.
    # dataTypes = ['los', 'heart', 'adult']
    # dataTypes = ['heart']
    # dataTypes = ['adult']
    dataTypes = ['Heart', 'Adult']
    scores = []
    for dataType in dataTypes:
        print(dataType)

        scores.append(runAll(dataType))

        print('DT')
        dt.run(dataType)

        print('Neural Network')
        network.run(dataType)

        print('Boosted')
        boosted.run(dataType)

        print('SVM')
        svm.run(dataType)

        print('KNN')
        knn.run(dataType)
    endTime = time()
    print(str(endTime - startTime))

    print(scores)
Example #8
    alpha, iterr = kerperceptron.run(5, X_train, y_train)

    # Make a prediction for each test point.
    for index, t in enumerate(X_test):
        y_pred_perceptron[index] = kerpredict.run(alpha, X_train, y_train, t)

    perceptron_err = np.mean(y_test != y_pred_perceptron)
    perceptron_error.append(perceptron_err)

    #svm
    (training_length, features) = np.shape(X_train)

    # training svm on the flattened label vector
    clf = svm.run(X_train, y_train.reshape(training_length), 0.001)
    #testing svm
    y_pred_svm = svmpredict.run(clf, X_test)
    y_pred_svm = y_pred_svm.reshape((test_length, 1))

    svm_err = np.mean(y_test != y_pred_svm)
    svm_error.append(svm_err)

print(svm_error)
print(perceptron_error)

# plotting data
fig, axs = plt.subplots(2)
fig.suptitle('Error Rate vs. Sample Size')
axs[0].plot(subset, svm_error)
axs[0].set_ylabel("SVM Error Rate")
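# The listing cuts off mid-figure. Assumed continuation (inferred, not from
# the original source): the second panel presumably mirrors the first for
# the kernel perceptron error.
axs[1].plot(subset, perceptron_error)
axs[1].set_ylabel("Perceptron Error Rate")
axs[1].set_xlabel("Sample Size")
plt.show()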