class ExtraTreeClassifier(Classifier):
    """Classifier backed by a lazily fitted ``ETC`` model.

    The training matrix is requested from the matrix database the first
    time ``classify`` is called, so constructing an instance does no
    training work.
    """

    def __init__(self, matrixdatabase):
        """Store the matrix database and create an unfitted ``ETC`` model.

        Args:
            matrixdatabase: Object providing ``make_train_matrix()`` and
                ``make_row_from_recipe(ingredients)``.
        """
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._etc = ETC()

    def learn(self, ingredients, cuisine):
        """Do nothing; training data is pulled from the matrix database
        on the first ``classify`` call instead."""
        return

    def classify(self, ingredients):
        """Return the first prediction for ``ingredients``.

        The model is fitted on the first call and reused afterwards.
        """
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._etc = self._etc.fit(matrix, classes)
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print('Fitting complete...')
            self._has_fit = True
        row = self._matrix_database.make_row_from_recipe(ingredients)
        return self._etc.predict(row)[0]
def myclassify(numfiers=5,xtrain=xtrain,ytrain=ytrain,xtest=xtest,ytest=ytest):
    """Fit a fixed sequence of classifiers and print each one's test score.

    The first classifier is always evaluated; each later one runs only
    while fewer than ``numfiers`` classifiers have been evaluated. The data
    defaults are bound to the module-level arrays of the same name at the
    time this function is defined.

    Args:
        numfiers: Upper bound on the number of classifiers to evaluate.
        xtrain: Training features passed to every ``fit`` call.
        ytrain: Training labels passed to every ``fit`` call.
        xtest: Held-out features passed to every ``score`` call.
        ytest: Held-out labels passed to every ``score`` call.

    Returns:
        None. Progress markers and one summary line per evaluated
        classifier are printed.
    """
    # Ordered (label, factory) pairs. Keeping the label next to the factory
    # guarantees that each printed score is attributed to the model that
    # produced it. The voting ensemble and the plain SVC are disabled, so
    # they have no entry (and therefore no label) here.
    candidates = [
        ("BaggingETC (with bootstraps set to false)",
         lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)),
        ("ETC", lambda: ETC()),
        ("BaggingETC", lambda: BaggingClassifier(ETC())),
        # Quadratic discriminant analysis: quadratic decision boundary.
        ("QDA", lambda: quadda()),
        ("DTC", lambda: DTC()),
        # k-nearest neighbours with the library's default k.
        ("KNN (default)", lambda: neighbors.KNeighborsClassifier()),
        # Linear discriminant analysis: linear decision boundary.
        ("LDA", lambda: linda()),
        ("RFC", lambda: RFC()),
        ("BaggingRFC (with bootstraps set to false)",
         lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)),
        ("BaggingSVC (with bootstraps set to false)",
         lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)),
        ("RFC (bootstrap false)", lambda: RFC(bootstrap=False)),
        ("GBC", lambda: GBC()),
        ("knn (n_neighbors = 10)", lambda: neighbors.KNeighborsClassifier(n_neighbors=10)),
        ("knn (n_neighbors = 3)", lambda: neighbors.KNeighborsClassifier(n_neighbors=3)),
        ("knn (ball tree algorithm)", lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree')),
        ("knn (kd_tree algorithm)", lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree')),
        ("Nearest Centroid", lambda: NearestCentroid()),
        # The earlier sweep over [None, 0.05, ..., 0.5] only ever scored the
        # model fitted with the last threshold, so only that one is fitted.
        ("Shrunken Centroid?", lambda: NearestCentroid(shrink_threshold=0.5)),
        ("ABC", lambda: ABC()),
    ]

    # Progress output kept from the earlier implementation:
    # label -> (marker text, whether the score is echoed right after it).
    progress = {
        "ETC": ("1", True),
        "BaggingETC": ("2", True),
        "QDA": ("4", False),
    }

    count = 0
    labels_run = []
    classifiers = []
    for label, build in candidates:
        # The first model runs unconditionally; later ones need headroom.
        if count > 0 and not count < numfiers:
            break

        model = build()
        model.fit(xtrain, ytrain)
        score = model.score(xtest, ytest)

        # Plain list for the first score, NumPy array once a second is added.
        classifiers = [score] if count == 0 else np.append(classifiers, score)
        labels_run.append(label)
        count += 1

        if label in progress:
            marker, echo_score = progress[label]
            print(marker)
            if echo_score:
                print(score)

    for label, score in zip(labels_run, classifiers):
        print("{} classifier has percent correct {}".format(label, score))
# In[21]:

### Decision tree baseline
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTC
tree1 = DTC()
print(tree1)
# One fit is enough: fit() returns the estimator itself, so printing the
# estimator afterwards shows the same text without training a second time.
tree1.fit(xtrain,ytrain1)
print(tree1)
print(tree1.score(xtest,ytest1))


# In[22]:

# Single extra-tree baseline
from sklearn.tree import ExtraTreeClassifier as ETC
tree2 = ETC()
print(tree2)
# Fitted once; see the note on the decision tree cell.
tree2.fit(xtrain,ytrain1)
print(tree2)
print(tree2.score(xtest,ytest1))


# In[23]:

# Bagging ensemble built on extra trees
from sklearn.ensemble import BaggingClassifier
bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain,ytrain1)
print(bagging1.score(xtest,ytest1))


# In[24]:
def build_separate_tree(X,y,max_features,max_depth,min_samples_split):
    """Fit an ``ExtraTreeClassifier`` on ``(X, y)`` and return the result.

    Args:
        X: Training features passed to ``fit``.
        y: Training targets passed to ``fit``.
        max_features: Forwarded to the classifier constructor.
        max_depth: Forwarded to the classifier constructor.
        min_samples_split: Forwarded to the classifier constructor.

    Returns:
        Whatever ``fit`` returns for the configured classifier.
    """
    clf = ExtraTreeClassifier(max_features=max_features,
                              max_depth=max_depth,
                              min_samples_split=min_samples_split)
    return clf.fit(X, y)
Ejemplo n.º 6
0
def myclassify_AudPow(numfiers,xtrain_1,xtrain_2,ytrain_1,ytrain_2,xtest):
    """Fit up to ``numfiers`` classifiers and summarise their predictions.

    The two training sets are concatenated, rows of ``xtest`` containing
    NaN or infinite values are dropped, and each classifier in a fixed
    order is fitted and asked to predict the remaining rows. Every
    prediction vector is then reduced with ``predWindowVecModeFinder``
    (using the ``xtrunclength`` entry loaded from
    ``../Files/xtrunclength.mat``) and rendered with ``predVec2Str``.

    Args:
        numfiers: Number of classifiers to run. Values above the number of
            available classifiers are capped; values below 1 run none.
        xtrain_1: First block of training features.
        xtrain_2: Second block of training features, appended along axis 0.
        ytrain_1: Labels for ``xtrain_1``.
        ytrain_2: Labels for ``xtrain_2``.
        xtest: 2-D feature matrix to predict (it is filtered along axis 1).

    Returns:
        Tuple ``(predictionStringMat, finalPredMat)``: a list with one
        ``predVec2Str`` result per fitted classifier, and a flat list of
        the ``int``-converted mode values of all classifiers in order.
    """
    # Keep only rows in which every value is finite (no NaN, Inf or -Inf).
    xtest = xtest[np.isfinite(xtest).all(axis=1), :]

    xtrain = np.append(xtrain_1, xtrain_2, 0)
    ytrain = np.ravel(np.append(ytrain_1, ytrain_2))
    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')['xtrunclength'][0]

    def make_voting():
        # Majority vote over five different kinds of classifier.
        return VotingClassifier(estimators=[
            ('svc', SVC()),
            ('rfc', RFC(bootstrap=False)),
            ('etc', ETC()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('qda', quadda()),
        ])

    # Classifier factories in evaluation order. Estimators are created
    # lazily so that only the requested ones are ever constructed.
    factories = [
        lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        lambda: ETC(),
        lambda: BaggingClassifier(ETC()),
        make_voting,
        lambda: SVC(),
        lambda: quadda(),   # quadratic discriminant analysis
        lambda: DTC(),
        lambda: neighbors.KNeighborsClassifier(),
        lambda: linda(),    # linear discriminant analysis
        lambda: RFC(),
        lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        lambda: RFC(bootstrap=False),
        lambda: GBC(),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
        lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        lambda: NearestCentroid(),
        lambda: ABC(),
    ]

    # Size the prediction matrix by the number of models that really run.
    # Allocating ``numfiers`` columns would leave uninitialised columns
    # behind whenever more models are requested than exist.
    n_models = max(0, min(numfiers, len(factories)))
    predictionMat = np.empty((xtest.shape[0], n_models))

    for column, build in enumerate(factories[:n_models]):
        model = build()
        model.fit(xtrain, ytrain)
        predictionMat[:, column] = model.predict(xtest)

    predictionStringMat = []
    finalPredMat = []
    for column in range(n_models):
        modeCol = predWindowVecModeFinder(predictionMat[:, column], xtrunclength)
        predictionStringMat.append(predVec2Str(modeCol))
        finalPredMat.extend(int(value) for value in modeCol)

    return predictionStringMat, finalPredMat
Ejemplo n.º 7
0
def classify(topicmodel, plotconfusionmatrix=False, multilabel=False):
    """Cross-validate a panel of classifiers on topic-model features.

    ``topicmodel[2]`` is vectorised with ``DictVectorizer`` to build the
    feature matrix and ``topicmodel[1]`` supplies the class labels. A
    most-frequent-class dummy model is evaluated first as a baseline, then
    every classifier in the panel. For each model the function prints
    5-fold cross-validation scores, then refits on the whole data set and
    prints either a confusion matrix and classification report
    (single-label) or the ``myCVAScore`` results and subset accuracy
    (multi-label).

    Args:
        topicmodel: Indexable whose item 1 holds the class labels and whose
            item 2 holds the feature mappings given to ``DictVectorizer``.
        plotconfusionmatrix: When true, plot the confusion matrix of each
            classifier. Only honoured in single-label mode, because no
            confusion matrix is computed in multi-label mode.
        multilabel: When true, labels are converted with
            ``ToIndicatorMatrix`` and the multi-label panel is used.

    Returns:
        None. All results are printed; the single-label decision tree is
        also exported to ``tree.dot``.
    """
    # Imported once here rather than on every pass through the loop below.
    from sklearn import tree

    # Build only the panel that will actually be evaluated.
    if multilabel:
        names = [
            "Logistic Regression", 'MLkNN', 'Decision Tree', 'Extra Tree', 'KNN',
            'Neural Net', 'Random Forest', 'Naive Bayes', "RBF SVM", "Linear SVM"
        ]
        classifiers = [
            LabelPowerset(LogisticRegression(C=1e5)),
            MLkNN(k=5, s=1.0, ignore_first_neighbours=0),
            LabelPowerset(DecisionTreeClassifier(max_depth=5)),
            LabelPowerset(ExtraTreeClassifier(max_depth=5)),
            LabelPowerset(KNeighborsClassifier(5)),
            LabelPowerset(MLPClassifier(alpha=1)),
            LabelPowerset(
                RandomForestClassifier(max_depth=5,
                                       n_estimators=10,
                                       max_features=1)),
            LabelPowerset(GaussianNB()),
            LabelPowerset(SVC(kernel='rbf', gamma=2, C=1)),
            LabelPowerset(SVC(kernel="linear", C=0.025)),
        ]
    else:
        names = [
            "Logistic Regression", "Nearest Neighbors", "Linear SVM", "RBF SVM",
            "Gaussian Process", "Decision Tree", "Random Forest", "Neural Net",
            "AdaBoost", "Naive Bayes"
        ]
        classifiers = [
            LogisticRegression(C=1e5, multi_class="ovr"),
            KNeighborsClassifier(5),
            SVC(kernel="linear", C=0.025),
            SVC(kernel='rbf', gamma=2, C=1),
            GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
            DecisionTreeClassifier(max_depth=5),
            RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
            MLPClassifier(alpha=1),
            AdaBoostClassifier(),
            GaussianNB()
        ]

    vec = DictVectorizer()
    X = vec.fit_transform(topicmodel[2]).toarray()
    feature_names = vec.get_feature_names()
    print(feature_names)

    classlabels = topicmodel[1]
    # ``multilabel`` is tested by truthiness everywhere so that all
    # branches agree for any falsy/truthy value, not just True/False.
    if multilabel:
        Y = ToIndicatorMatrix(classlabels)
        classes = list(
            set([classe for sublist in classlabels for classe in sublist]))
    else:
        Y = classlabels
        classes = list(set(classlabels))

    # Number of cross-validation folds
    cvn = 5
    print(('\n Start multi-label classification!'
           if multilabel else 'Start single-label classification!'))
    print('Labels (classes):')
    print(classes)

    # see https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/
    # http://scikit.ml/api/index.html
    print('\n Results of the model evaluation: \n')

    scores = [
        'accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'
    ]

    def cv_scores(estimator):
        """Return one ``cross_val_score`` array per metric in ``scores``."""
        return [
            cross_val_score(estimator, X, Y, cv=cvn, scoring=metric)
            for metric in scores
        ]

    # Naive baseline (always predicts the most frequent class)
    clf = DummyClassifier(strategy='most_frequent', random_state=10)
    if multilabel:
        clf = LabelPowerset(clf)
    dummyscores, dummyprescores, dummyrescores, dummyfescores = cv_scores(clf)
    print(
        "\n {}-CV naive classifier (most frequent class): {}: {} (+/- {}), {}: {}, {}: {}, {}: {}"
        .format(cvn, scores[0], dummyscores.mean(), dummyscores.std(),
                scores[1], dummyprescores.mean(), scores[2],
                dummyrescores.mean(), scores[3], dummyfescores.mean()))
    y_pred = clf.fit(X, Y).predict(X)
    if not multilabel:
        print("Fitting on entire dataset (no CV):")
        cnf_matrix = metrics.confusion_matrix(Y, y_pred, labels=classes)
        print(cnf_matrix)
        print(metrics.classification_report(Y, y_pred, labels=classes))
    else:
        myscores = myCVAScore(clf, X, Y, cvn)
        print(
            "\n {}-CV naive classifier (my own) {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}"
            .format(cvn, scores[0], myscores[0], scores[1], myscores[1],
                    scores[2], myscores[2], scores[3], myscores[3], 'coverage',
                    myscores[4], 'hamming loss', myscores[5], 'jaccard',
                    myscores[6]))
        print("Fitting on entire dataset (no CV):")
        print('subset accuracy: {}'.format(accuracy_score(y_pred, Y)))

    # Evaluate every classifier of the selected panel
    for name, clf in zip(names, classifiers):
        # Standard scores computed by scikit-learn
        accscores, pscores, rscores, fscores = cv_scores(clf)
        print("\n {}-CV  {}: {}: {} (+/- {}), {}: {}, {}: {}, {}: {}".format(
            cvn, name, scores[0], accscores.mean(), accscores.std(), scores[1],
            pscores.mean(), scores[2], rscores.mean(), scores[3],
            fscores.mean()))

        clffit = clf.fit(X, Y)
        y_pred = clffit.predict(X)
        if not multilabel:
            print("Fitting on entire dataset (no CV):")
            cnf_matrix = metrics.confusion_matrix(Y, y_pred, labels=classes)
            print(cnf_matrix)
            print(metrics.classification_report(Y, y_pred, labels=classes))
        else:
            # Custom scores for multi-label classification
            myscores = myCVAScore(clf, X, Y, cvn)
            print(
                "\n {}-CV {} (my own): {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}"
                .format(cvn, name, scores[0], myscores[0], scores[1],
                        myscores[1], scores[2], myscores[2], scores[3],
                        myscores[3], 'coverage', myscores[4], 'hamming loss',
                        myscores[5], 'jaccard', myscores[6]))
            print("Fitting on entire dataset (no CV):")
            print(' subset accuracy: {}'.format(accuracy_score(y_pred, Y)))

        # Export the fitted decision tree for inspection with graphviz
        if name == "Decision Tree" and not multilabel:
            tree.export_graphviz(clffit,
                                 out_file='tree.dot',
                                 class_names=sorted(classes),
                                 feature_names=feature_names)

        # Plot non-normalized confusion matrix
        #plt.figure()
        np.set_printoptions(precision=2)
        # A confusion matrix only exists in single-label mode; plotting in
        # multi-label mode would reference a matrix that was never computed.
        if plotconfusionmatrix and not multilabel:
            plot_confusion_matrix(cnf_matrix,
                                  classes=classes,
                                  title='Confusion matrix for ' + name)
def main():
    """Benchmark untuned AdaBoost variants, then run each one's tuner.

    Reads ``inception_representations_aug.csv`` (features in all but the
    last column, label in the last), standardises the features and reduces
    them to 500 principal components. Five AdaBoost configurations are then
    scored through ``fill_results_df`` on an 80/20 split, and a fresh clone
    of each is handed to its hyper-parameter tuning routine.
    """
    #dataset = pd.read_csv('good_representations_aug.csv')
    frame = pd.read_csv('inception_representations_aug.csv')
    X = frame.iloc[:, :-1].values
    y = frame.iloc[:, -1].values

    known_objects = [
        "Vase", "Teapot", "Bottle", "Spoon", "Plate", "Mug", "Knife", "Fork",
        "Flask", "Bowl"
    ]
    # The label column holds file paths: replace each entry with the first
    # object name that occurs in it (entries without a match stay as-is).
    for position in range(y.size):
        matched = next(
            (name for name in known_objects if name in y[position]), None)
        if matched is not None:
            y[position] = matched

    # Integer-encode the string labels.
    # NOTE(review): y_enc is not used again in this function - confirm
    # whether later code was meant to consume it.
    encoder = LabelEncoder()
    y_enc = encoder.fit_transform(y)

    # Scale the features before PCA
    X = StandardScaler().fit_transform(X)

    # Dimensionality reduction with PCA
    #   GOOD: 125 features account for 99.9% variance
    #   pca = PCA(n_components=230)
    #   inception: 500 features account for 99.9% variance
    X = PCA(n_components=500).fit_transform(X)

    # The dataset is imbalanced; representations are upsampled to get new samples
    sample_count = 100

    # Split the upsampled dataset into independent and dependent variables.
    # NOTE(review): ``upsampled_dataset`` is not assigned anywhere in this
    # function and ``sample_count`` is never read - the upsampling step looks
    # like it is missing here. Confirm against the full file.
    X = upsampled_dataset.iloc[:, :-1].values
    y = upsampled_dataset.iloc[:, -1].values

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # AdaBoost with its default base learner and with four explicit ones
    ada_base = AdaBoostClassifier()
    ada_deci = AdaBoostClassifier(DecisionTreeClassifier())
    ada_extr = AdaBoostClassifier(ExtraTreeClassifier())
    ada_logr = AdaBoostClassifier(LogisticRegression())
    ada_svml = AdaBoostClassifier(SVC(probability=True, kernel='linear'))

    labelled_models = [
        ('Base', ada_base),
        ('DecisonTree', ada_deci),
        ('ExtraTree', ada_extr),
        ('LogisticRegression', ada_logr),
        ('SVM', ada_svml),
    ]
    model_names = [label for label, _ in labelled_models]
    models = [model for _, model in labelled_models]

    r = fill_results_df(models, model_names, ['accuracy'], X_train, X_test,
                        y_train, y_test, 10)
    print('Results from untuned classifiers', r)

    # Find best parameters: each tuner receives an unfitted clone and the
    # full (unsplit) data.
    ada_base_hyperparameter_tuning(clone(ada_base), X, y)
    ada_deci_hyperparameter_tuning(clone(ada_deci), X, y)
    ada_extr_hyperparameter_tuning(clone(ada_extr), X, y)
    ada_svml_hyperparameter_tuning(clone(ada_svml), X, y)
    ada_logr_hyperparameter_tuning(clone(ada_logr), X, y)
Ejemplo n.º 9
0
def do_classification(clm, data_fname, clm_type):
    """Cross-validate a battery of classifiers on two target columns.

    Loads ``data_fname``, keeps the columns listed in ``clm``, filters the
    rows on ``heartRate``, ``skinTemperature`` and ``met``, and then calls
    ``do_cross_validation`` for every model/parameter combination, once per
    target. A timestamped CSV file is created under ``output/`` and its
    writer is handed to every ``do_cross_validation`` call.

    Args:
        clm: Column names to use. All but the last two are features; the
            last two are the two targets. Must include ``heartRate``,
            ``skinTemperature`` and ``met``.
        data_fname: Path of the input CSV file.
        clm_type: Value embedded in the result file name.

    Returns:
        None.
    """
    d_name = "output"
    if not os.path.isdir(d_name):
        os.mkdir(d_name)

    fname = os.path.basename(data_fname).replace('.csv', '')
    fn = 'result_' + fname + "_type" + str(clm_type) + "_" +\
        datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '.csv'
    csv_out_fname = os.path.join(d_name, fn)

    # Parameter grids, each shared by several models below.
    fine_to_coarse = np.concatenate(
        (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
         np.arange(20, 50, 5), np.arange(50, 150, 10)))
    neighbor_grid = np.concatenate(
        (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
         np.arange(50, 150, 10)))
    depth_list = np.concatenate(
        (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
         np.arange(50, 100, 10), np.arange(150, 1000, 50)))
    forest_sizes = np.concatenate(
        (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
         np.arange(50, 150, 10), np.arange(150, 1000, 50)))
    boosting_sizes = np.arange(5, 1000, 20)
    # Models without a swept parameter report 0 as their parameter value.
    no_param = [0]

    # ``with`` closes the result file even if a model raises part-way through.
    with open(csv_out_fname, 'w') as fi:
        csv_out = csv.writer(fi, delimiter=',')

        # Create dataframe for training
        base_df = pd.read_csv(data_fname)
        df = base_df[clm]

        df = df[df['heartRate'] > 40]
        df = df[df['skinTemperature'] > 10]
        df = df[df['met'] > 0.4]

        X_train = df[clm[:-2]]
        Y_train = [df[clm[-2]], df[clm[-1]]]

        def sweep(ml_name, make_clf, params, best_effort=False):
            """Cross-validate ``make_clf(p)`` for both targets and every ``p``.

            With ``best_effort`` set, a failing combination is reported and
            skipped instead of aborting the whole run.
            """
            for t in [0, 1]:
                for param in params:
                    try:
                        clf = make_clf(param)
                        do_cross_validation(ml_name, clf, X_train, Y_train[t],
                                            t, csv_out, fname, param)
                    except Exception as err:
                        # Only ordinary errors are tolerated; interrupts and
                        # interpreter exits still propagate.
                        if not best_effort:
                            raise
                        print("Skipping {} (target {}, parameter {}): {}".format(
                            ml_name, t, param, err))

        # Model: Decision Tree
        sweep('Decision Tree',
              lambda depth: DecisionTreeClassifier(class_weight=None,
                                                   criterion='entropy',
                                                   max_depth=depth,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort=False,
                                                   random_state=None,
                                                   splitter='best'),
              depth_list)

        # Model: Extra Tree Classifier
        sweep('Extremely randomized tree classifier',
              lambda _: ExtraTreeClassifier(), no_param)

        # Model: Gaussian
        sweep('Gaussian Naive Bayes',
              lambda _: GaussianNB(priors=None), no_param)

        # Model: Multivariate Bernoulli Model
        sweep('Multivariate Bernoulli Model',
              lambda a: BernoulliNB(alpha=a), fine_to_coarse)

        # Model: AdaBoost Classifier
        sweep('AdaBoost classifier',
              lambda n: AdaBoostClassifier(algorithm='SAMME.R',
                                           base_estimator=None,
                                           learning_rate=0.1,
                                           n_estimators=n,
                                           random_state=None),
              boosting_sizes)

        # Model: Gradient Boosting Classifier
        sweep('Gradient Boosting Classifier',
              lambda n: GradientBoostingClassifier(criterion='friedman_mse',
                                                   init=None,
                                                   learning_rate=0.1,
                                                   loss='deviance',
                                                   max_depth=4,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=n,
                                                   presort='auto',
                                                   random_state=None,
                                                   subsample=1.0,
                                                   verbose=0,
                                                   warm_start=False),
              boosting_sizes)

        # Model: Random Forest Classifier
        sweep('Random Forest Classifier',
              lambda n: RandomForestClassifier(n_estimators=n), forest_sizes)

        # Models: Support Vector Machines, one sweep over C per kernel
        sweep('Support Vector Machines - RBF',
              lambda c: SVC(C=c, kernel='rbf'), fine_to_coarse)
        sweep('Support Vector Machines - poly',
              lambda c: SVC(C=c, kernel='poly'), fine_to_coarse)
        sweep('Support Vector Machines - Sigmoid',
              lambda c: SVC(C=c, kernel='sigmoid'), fine_to_coarse)
        sweep('Support Vector Machines - Linear',
              lambda c: SVC(C=c, kernel='linear'), fine_to_coarse)

        # Model: KNeighborsClassifier (some settings may fail; keep going)
        sweep('KNeighborsClassifier',
              lambda n: KNeighborsClassifier(n_neighbors=n),
              neighbor_grid, best_effort=True)

        # Model: Radius Neighbors Classifier (same grid, used as the radius)
        sweep('Radius Neighbors Classifier',
              lambda n: RadiusNeighborsClassifier(radius=n),
              neighbor_grid, best_effort=True)

        # Model: NearestCentroid
        sweep('Nearest Centroid Classifier',
              lambda _: NearestCentroid(), no_param)
Ejemplo n.º 10
0
# Settings shared by the three tree models.
_tree_settings = dict(random_state=100, max_depth=3, min_samples_leaf=5)

# Two decision trees (default and entropy criterion), an SVM with a
# polynomial kernel, and a single extra tree.
clf_gini = DecisionTreeClassifier(**_tree_settings)
clf_entropy = DecisionTreeClassifier(criterion='entropy', **_tree_settings)
clf_svc = svm.SVC(random_state=100, kernel='poly')
clf_ext = ExtraTreeClassifier(**_tree_settings)

# Train every model on the same training split.
for _model in (clf_gini, clf_entropy, clf_svc, clf_ext):
    _model.fit(X_train, y_train)

# Predict the test split with each trained model.
y_pred_gi, y_pred_en, y_pred_sv, y_pred_et = (
    fitted.predict(X_test)
    for fitted in (clf_gini, clf_entropy, clf_svc, clf_ext)
)

# Print accuracy scores (as percentages), then the true labels next to the
# SVM predictions.
for _title, _predicted in (
        ("Gini accuracy score: ", y_pred_gi),
        ("Entropy accuracy score: ", y_pred_en),
        ("SVM accuracy score: ", y_pred_sv),
        ("Extra tree accuracy score: ", y_pred_et),
):
    print(_title, accuracy_score(y_test, _predicted)*100)
print(y_test)
print(y_pred_sv)
Ejemplo n.º 11
0
# print(multilabel_confusion_matrix(y_test_og, preds))
clear_session()

# Baselines: fit each estimator on the original labels, report its test
# accuracy, then call clear_session() before moving on to the next one.
for _message, _estimator_cls in (
        ("SVC Accuracy:", svm.SVC),
        ("LinearSVC Accuracy:", svm.LinearSVC),
        ("ExtraTreeClassifier Accuracy:", ExtraTreeClassifier),
):
    classifier = _estimator_cls()
    classifier.fit(X_train, y_train_og)
    preds = classifier.predict(X_test)
    print(_message, accuracy_score(y_test_og, preds))
    # print("Confusion Matrix:")
    # print(multilabel_confusion_matrix(y_test_og, preds))
    clear_session()
Ejemplo n.º 12
0
def _geno_candidate_models(t):
    """Return the ``(tag, estimator)`` pairs evaluated at grid position ``t``.

    ``t`` runs from 1 to 4 and selects one entry from each hyper-parameter
    list below.  The tag is the short model name used in the output file
    names.  Estimators are created fresh on every call.
    """
    hidden_layers = [(20, 20, 20), (30, 30), (10, 10, 10, 10), (30, 20, 10)]
    ada_bases = [DecisionTreeClassifier, LogisticRegression,
                 ExtraTreeClassifier, GaussianNB]
    gb_impurity = [0.0001, 0.001, 0.01, 0.1]
    lr_strength = [0.0001, 0.001, 0.01, 0.1]
    rf_impurity = [0.0001, 0.001, 0.01, 0.1]
    i = t - 1
    return [
        ('NN', MLPClassifier(hidden_layer_sizes=hidden_layers[i],
                             max_iter=1000)),
        # The base learner is passed positionally: its keyword was renamed
        # from ``base_estimator`` to ``estimator`` in newer scikit-learn, but
        # it is the first positional parameter in both.
        ('ada', AdaBoostClassifier(ada_bases[i]())),
        ('GB', GradientBoostingClassifier(min_impurity_decrease=gb_impurity[i])),
        # An L1 penalty needs a solver that supports it; 'liblinear' was the
        # historical default and is rejected-free on current releases too.
        ('LR', LogisticRegression(penalty='l1', C=lr_strength[i],
                                  solver='liblinear', max_iter=10000)),
        ('RF', RandomForestClassifier(min_impurity_decrease=rf_impurity[i])),
    ]


def _save_positive_probability(model, features, path):
    """Save column 1 of ``model.predict_proba(features)`` to ``path``."""
    np.save(path, model.predict_proba(features)[:, 1])


def ML_model_Geno(args):
    """Fit a grid of classifiers on genotype matrices and save their scores.

    Reads ``<outpath>/train/disease_train.npy`` as the training labels.  For
    every genotype set whose training file
    ``<outpath>/train/genotype_train_<5E3|5E4|5E5|5E6>.npy`` exists, the
    matching ``valid`` and ``test`` matrices are loaded and, for each of four
    hyper-parameter settings, five models (MLP, AdaBoost, gradient boosting,
    L1 logistic regression, random forest) are fitted.  Column 1 of each
    model's ``predict_proba`` output is written to
    ``<outpath>/ML_Geno/<s>_<t>_<model>_{valid,test}.npy`` where ``s`` is the
    1-based genotype-set number and ``t`` the 1-based setting number.

    Args:
        args: object with an ``outpath`` attribute naming the working
            directory.

    Returns:
        None.  Results are written to disk.
    """
    out_dir = os.path.join(args.outpath, 'ML_Geno')
    # Create the directory through the OS API rather than a shell command, so
    # paths containing spaces or shell metacharacters are handled correctly.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    disease_train = np.load(
        os.path.join(args.outpath, 'train', 'disease_train.npy'))

    for s, suffix in enumerate(('5E3', '5E4', '5E5', '5E6'), start=1):
        train_file = os.path.join(
            args.outpath, 'train', 'genotype_train_' + suffix + '.npy')
        if not os.path.exists(train_file):
            # This genotype set was not produced; skip it.
            continue
        Geno_train = np.load(train_file)
        Geno_valid = np.load(os.path.join(
            args.outpath, 'valid', 'genotype_valid_' + suffix + '.npy'))
        Geno_test = np.load(os.path.join(
            args.outpath, 'test', 'genotype_test_' + suffix + '.npy'))

        for t in [1, 2, 3, 4]:
            for tag, model in _geno_candidate_models(t):
                model.fit(Geno_train, disease_train)
                stem = os.path.join(out_dir, '{}_{}_{}'.format(s, t, tag))
                _save_positive_probability(model, Geno_valid,
                                           stem + '_valid.npy')
                _save_positive_probability(model, Geno_test,
                                           stem + '_test.npy')
def _ask_choice(prompt):
    """Show ``prompt`` and return the reply as an ``int``.

    ``input`` yields text on Python 3, so the reply is converted before it is
    compared with the numeric menu entries.  ``None`` is returned when the
    reply is not a whole number, which sends the caller to its
    "no ... chosen" branch.
    """
    try:
        return int(input(prompt))
    except (TypeError, ValueError):
        return None


def _abort(message):
    """Print ``message`` and stop the program with a failing exit status."""
    print(message)
    sys.exit(1)


def _features_and_labels(frame):
    """Split a loaded data set into ``(features, labels)``.

    The first two columns and the last column are dropped from the features;
    the last column is returned as the label series.
    """
    features = frame.drop(
        [frame.columns[0], frame.columns[1], frame.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    labels = pd.Series(frame.iloc[:, -1])
    return features, labels


def _print_match_table(y_test, predictions):
    """Print one actual/predicted/match row per test sample, then accuracy."""
    print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
    hits = 0
    for actual, predicted in zip(y_test, predictions):
        match = actual == predicted
        hits += 1 if match else 0
        # ``!s`` keeps the flag readable as True/False.
        print('{:10}\t{:10}\t{!s:10}'.format(actual, predicted, match))
    total = len(predictions)
    accuracy = 100.0 * hits / total if total else 0.0
    print('accuracy = {:7.2f}%'.format(accuracy))


def _build_classifier(learning_type, method, classifier):
    """Return the estimator for entry ``classifier`` of the classifier menu.

    Entries 1, 4 and 5 ask one follow-up question.  The selected menu path is
    echoed before the estimator is created; an unknown choice aborts.
    """
    if classifier == 1:
        criterion = _ask_choice('criterion: [1: gini, 2: entropy] ')
        names = {1: 'gini', 2: 'entropy'}
        if criterion not in names:
            _abort('no criterion chosen')
        print(learning_type, method, classifier, criterion)
        return DecisionTreeClassifier(criterion=names[criterion])
    if classifier == 4:
        n = _ask_choice('n: [1: 1, 2: 3, 3: 5] ')
        neighbours = {1: 1, 2: 3, 3: 5}
        if n not in neighbours:
            _abort('no n chosen')
        print(learning_type, method, classifier, n)
        return KNeighborsClassifier(n_neighbors=neighbours[n])
    if classifier == 5:
        version = _ask_choice(
            'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
        )
        variants = {
            1: lambda: GaussianNB(),
            2: lambda: BernoulliNB(),
            3: lambda: MultinomialNB(),
            4: lambda: ComplementNB(),
        }
        if version not in variants:
            _abort('no version chosen')
        print(learning_type, method, classifier, version)
        return variants[version]()

    # Entries that need no follow-up question.  Lambdas keep construction
    # lazy, so only the chosen estimator is ever created.
    factories = {
        2: lambda: ExtraTreeClassifier(),
        3: lambda: ExtraTreesClassifier(),
        6: lambda: RadiusNeighborsClassifier(radius=1.0),
        7: lambda: RandomForestClassifier(n_estimators=50, random_state=1),
        8: lambda: LinearSVC(multi_class='crammer_singer'),
        9: lambda: GradientBoostingClassifier(),
        10: lambda: GaussianProcessClassifier(multi_class='one_vs_one'),
        11: lambda: SGDClassifier(),
        12: lambda: PassiveAggressiveClassifier(),
        13: lambda: NearestCentroid(),
        14: lambda: Perceptron(tol=1e-3, random_state=0),
        15: lambda: MLPClassifier(),
        16: lambda: AdaBoostClassifier(n_estimators=100),
    }
    if classifier not in factories:
        _abort('no classifier chosen')
    print(learning_type, method, classifier)
    return factories[classifier]()


def _build_regressor(learning_type, method, regressor):
    """Return the estimator for entry ``regressor`` of the regressor menu.

    Entries 5 and 6 additionally ask for a multiclass strategy and wrap the
    regressor accordingly.  An unknown choice aborts.
    """
    if regressor in (5, 6):
        strategy = _ask_choice('strategy: [1: one vs rest, 2: one vs one] ')
        if strategy not in (1, 2):
            _abort('no strategy selected')
        print(learning_type, method, strategy, regressor)
        base = LinearRegression() if regressor == 5 else DecisionTreeRegressor()
        if strategy == 1:
            return OneVsRestClassifier(base)
        return OneVsOneClassifier(base)

    factories = {
        1: lambda: LinearDiscriminantAnalysis(),
        2: lambda: LogisticRegression(solver='lbfgs',
                                      multi_class='multinomial'),
        3: lambda: RidgeClassifier(),
        4: lambda: QuadraticDiscriminantAnalysis(),
        7: lambda: PLSRegression(n_components=2),
        8: lambda: PLSCanonical(n_components=2),
        9: lambda: CCA(n_components=1),
        10: lambda: Lasso(alpha=0.1),
        11: lambda: MultiTaskLasso(alpha=0.1),
        12: lambda: ElasticNet(random_state=0),
        13: lambda: MultiTaskElasticNet(random_state=0),
        14: lambda: Lars(n_nonzero_coefs=1),
        15: lambda: LassoLars(alpha=.1),
        16: lambda: OrthogonalMatchingPursuit(),
        17: lambda: BayesianRidge(),
        18: lambda: ARDRegression(),
        19: lambda: TheilSenRegressor(random_state=0),
        20: lambda: HuberRegressor(),
        21: lambda: RANSACRegressor(random_state=0),
    }
    if regressor not in factories:
        _abort('no regressor chosen')
    print(learning_type, method, regressor)
    return factories[regressor]()


def main():
    """Interactively pick a model, train it, and report test-set results.

    Expects two command-line arguments: the training CSV and the test/dev
    CSV.  In both files the first two columns are ignored and the last column
    is the label.  The user is then asked for a learning type (supervised,
    semi-supervised, unsupervised) and the model within it.

    * Supervised classification writes a per-sample table plus accuracy to
      ``<type>,<method>,<classifier>.txt`` and prints accuracy, a
      classification report and a confusion matrix.
    * Supervised regression, semi-supervised models and k-means print a
      per-sample table and accuracy.
    * Random-trees embedding and nearest neighbours produce no class labels,
      so their raw output is printed instead of an accuracy.

    Exits with status 1 on a usage error or an unrecognised menu choice.
    """
    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit(1)

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])

    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))

    x_train, y_train = _features_and_labels(data_train)
    x_test, y_test = _features_and_labels(data_test)

    learning_type = _ask_choice(
        'type: [1: supervised, 2: semi-supervised, 3: unsupervised] ')
    if learning_type == 1:
        method = _ask_choice('method: [1: classification, 2: regression] ')
        if method == 1:
            classifier = _ask_choice(
                'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
            )
            model = _build_classifier(learning_type, method, classifier)
            model.fit(x_train, y_train)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            accuracy_line = 'accuracy: {:7.2f}%'.format(
                100 * accuracy_score(y_test, predictions))

            filename = '{},{},{}.txt'.format(learning_type, method,
                                             classifier)
            with open(filename, 'w') as output:
                # One line per record; the header lists exactly the three
                # fields each row carries.
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    'actual', 'predict', 'match?'))
                for actual, predicted in zip(y_test, predictions):
                    output.write('{:10}\t{:10}\t{!s:10}\n'.format(
                        actual, predicted, actual == predicted))
                output.write(accuracy_line + '\n')

            print(accuracy_line)
            # Pass the same label order to both reports so every display name
            # is attached to its own class.
            labels = ['RightTroll', 'LeftTroll', 'Other']
            print(
                classification_report(y_test,
                                      predictions,
                                      labels=labels,
                                      target_names=labels))
            print(confusion_matrix(y_test, predictions, labels=labels))
        elif method == 2:
            regressor = _ask_choice(
                'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevence determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
            )
            model = _build_regressor(learning_type, method, regressor)
            model.fit(x_train, y_train)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            _print_match_table(y_test, predictions)
        else:
            _abort('no method chosen')
    elif learning_type == 2:
        classifier = _ask_choice(
            'classifier: [1: label propagation, 2: label spreading] ')
        if classifier == 1:
            print(learning_type, classifier)
            model = LabelPropagation()
        elif classifier == 2:
            print(learning_type, classifier)
            model = LabelSpreading()
        else:
            _abort('no classifier chosen')
        model.fit(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        _print_match_table(y_test, predictions)
    elif learning_type == 3:
        method = _ask_choice(
            'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
        )
        if method == 1:
            clusterer = _ask_choice('clustere: [1: k means]')
            if clusterer != 1:
                _abort('no clusterer chosen')
            clusters = _ask_choice('clusters: [1: 1, 2: 2, 3: 3] ')
            if clusters not in (1, 2, 3):
                _abort('no clusters chosen')
            print(learning_type, method, clusters)
            model = KMeans(n_clusters=clusters, random_state=0)
            model.fit(x_train)

            # predict output
            predictions = model.predict(x_test)

            # check details (format() copes with the array; '+' would raise)
            print('centroids: {}'.format(model.cluster_centers_))
            _print_match_table(y_test, predictions)
        elif method == 2:
            model = RandomTreesEmbedding()
            model.fit(x_train)
            # apply() yields leaf indices per tree, not class labels, so
            # there is nothing to compare against y_test.
            print('leaf indices: {}'.format(model.apply(x_test)))
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            model.fit(x_train)
            # Query the fitted model with the test features.
            distances, indices = model.kneighbors(x_test)
            print('distances: {}'.format(distances))
            print('indices: {}'.format(indices))
        else:
            _abort('no method chosen')
    else:
        _abort('no type chosen')
Ejemplo n.º 14
0
#%%
# Copy every column named in `cols` from `data` into `hotlab`, which is used
# below as the feature table.
for col in cols:
    hotlab[col] = data[col]

# Target column.  Presumably `data` is a pandas DataFrame, in which case the
# double brackets keep `promoted` two-dimensional -- TODO confirm.
promoted = data[["is_promoted"]]

#%%
# Split features and target into train and test parts.  No random_state is
# passed here.
x_train, x_test, y_train, y_test = train_test_split(hotlab, promoted)

sm = SMOTE(random_state=20)

# NOTE(review): the resampled pair produced here is not used anywhere in the
# visible code -- the classifier below is fitted on x_train / y_train.
# Confirm whether it was meant to train on the resampled data.
# NOTE(review): newer imbalanced-learn releases appear to name this method
# `fit_resample` -- verify against the installed version.
train_input_new, train_output_new = sm.fit_sample(x_train, y_train)

#%%
# Fit a single extra tree on the training split and score its test-set
# predictions with F1.
class1 = ExtraTreeClassifier()
class1.fit(x_train, y_train)
pred1 = class1.predict(x_test)
score = f1_score(y_test, pred1)

#%%

# Confusion matrix of the same test-set predictions.
confussion = confusion_matrix(y_test, pred1)

#%%
#For submission
# Load the submission data set from a hard-coded local path.
submission_data = pd.read_csv("D:\\Hackathons\\Promotion\\test_2umaH9m.csv")

#%%
# Fill missing values: "Unknown" for education, and the column mean for the
# previous-year rating.
submission_data["education"] = submission_data["education"].fillna("Unknown")
submission_data["previous_year_rating"] = submission_data["previous_year_rating"].fillna(np.mean(submission_data["previous_year_rating"]))
Ejemplo n.º 15
0
class stacked_generalization():
    """Two-tier stacking ensemble evaluated with stratified cross-validation.

    Four Tier-1 (base) classifiers produce label predictions; a Tier-2 (meta)
    classifier is trained on those predictions.  ``data`` is indexed as
    ``data[rows, :]`` and ``target`` as ``target[rows]``.
    """

    def __init__(self, data, target):
        """Store the data set and create the base and meta classifiers.

        Args:
            data: feature matrix, one row per sample.
            target: labels; a two-dimensional target is reshaped to one
                dimension with ``target.shape[0]`` entries.
        """
        self.data = data
        if len(target.shape) == 2:
            # Convert 2-dim target array into 1-dim target array
            self.target = target.reshape(target.shape[0])
        else:
            self.target = target

        # Per-fold views, filled in by train_stacked_generalization_CV.
        self.training_data = None
        self.training_target = None
        self.test_data = None
        self.test_target = None

        # Construct the Tier-1 (base) classifiers
        self.Tier1_classifier1 = LogisticRegression(solver="lbfgs")
        self.Tier1_classifier2 = MultinomialNB()
        self.Tier1_classifier3 = LinearSVC(penalty="l2")
        self.Tier1_classifier4 = ExtraTreeClassifier()

        # Construct the Tier-2 (meta) classifier
        self.meta_classifier = ExtraTreeClassifier()

    def _tier1_classifiers(self):
        """Return the current Tier-1 classifiers in their fixed column order."""
        return (self.Tier1_classifier1, self.Tier1_classifier2,
                self.Tier1_classifier3, self.Tier1_classifier4)

    def _fit_tier1(self, data, target):
        """Fit every Tier-1 classifier on ``data`` / ``target``."""
        for classifier in self._tier1_classifiers():
            classifier.fit(data, target)

    def _tier1_predictions(self, data):
        """Return the Tier-1 predictions for ``data``, one column per classifier."""
        columns = []
        for classifier in self._tier1_classifiers():
            output = classifier.predict(data)
            columns.append(output.reshape(output.shape[0], 1))
        return np.hstack(columns)  # horizontally combined

    def TrainingData_Stratified_KFold_split(self, n_split=5, shuffle=False):
        """Partition the current training fold into stratified blocks.

        The training data is divided into ``n_split`` blocks.  For each
        split, the Tier-1 classifiers are meant to be trained on the training
        part and evaluated on the held-out (pseudo-test) part.
        ``n_split`` cannot be greater than the number of members in each
        class.

        Returns:
            Four lists with one entry per split: training data, training
            targets, evaluation data, evaluation targets.
        """
        skf_blocks = StratifiedKFold(n_splits=n_split, shuffle=shuffle)

        training_blocks_data = []
        training_blocks_target = []
        evaluation_blocks_data = []
        evaluation_blocks_target = []
        for fit_index, eval_index in skf_blocks.split(self.training_data,
                                                      self.training_target):
            training_blocks_data.append(self.training_data[fit_index, :])
            training_blocks_target.append(self.training_target[fit_index])
            evaluation_blocks_data.append(self.training_data[eval_index, :])
            evaluation_blocks_target.append(self.training_target[eval_index])

        return training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target

    def train_meta_classifier(self):
        """Train the meta classifier on held-out Tier-1 predictions.

        The current training fold is split with the default settings of
        ``TrainingData_Stratified_KFold_split``.  For every split the Tier-1
        classifiers are fitted on the training part and their predictions on
        the evaluation part are collected; the meta classifier is then fitted
        on all collected predictions against the matching true labels.
        """
        training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target = self.TrainingData_Stratified_KFold_split(
        )

        # Tier-1 outputs on each evaluation block, in split order
        Tier1_outputs = []
        for block in range(len(training_blocks_data)):
            self._fit_tier1(training_blocks_data[block],
                            training_blocks_target[block])
            Tier1_outputs.append(
                self._tier1_predictions(evaluation_blocks_data[block]))

        # Stack the per-block outputs and the per-block true labels so that
        # rows and labels line up.
        Tier1_outputs = np.vstack(Tier1_outputs)
        evaluation_blocks_target = np.concatenate(evaluation_blocks_target)

        self.meta_classifier.fit(Tier1_outputs, evaluation_blocks_target)

        print("The training of meta classifier is finished")

    def train_stacked_generalization_CV(self, n_split=5, shuffle=False):
        """Cross-validate the stacked model and return per-fold metrics.

        For each of the ``n_split`` stratified folds: train the meta
        classifier on the training part, refit the Tier-1 classifiers on the
        whole training part, feed their test-set predictions to the meta
        classifier, and score the rounded result.  ``n_split`` cannot be
        greater than the number of members in each class.  The elapsed time
        is printed.

        Returns:
            dict of numpy arrays with one value per fold, under the keys
            ``"test_accuracy"``, ``"test_precision_weighted"`` and
            ``"test_recall_weighted"``.
        """
        skf_cv = StratifiedKFold(n_splits=n_split, shuffle=shuffle)

        # Per-fold metrics
        test_cv_accuracy = []
        test_cv_recall = []
        test_cv_precision = []

        time_start = time.time()  # start time

        for training_index, test_index in skf_cv.split(self.data,
                                                       self.target):
            self.training_data = self.data[training_index, :]
            self.training_target = self.target[training_index]
            self.test_data = self.data[test_index, :]
            self.test_target = self.target[test_index]

            # train the meta classifier
            self.train_meta_classifier()

            # Retrain all Tier-1 classifiers on the full training part, then
            # predict the test part with them.
            self._fit_tier1(self.training_data, self.training_target)
            testset_outputs_Tier1 = self._tier1_predictions(self.test_data)

            # The meta classifier turns the Tier-1 predictions into the final
            # labels; these are rounded before scoring.
            testset_outputs_meta = self.meta_classifier.predict(
                testset_outputs_Tier1)
            testset_outputs_meta = np.round(testset_outputs_meta)

            test_cv_accuracy.append(
                accuracy_score(self.test_target, testset_outputs_meta))
            test_cv_recall.append(
                recall_score(self.test_target, testset_outputs_meta))
            test_cv_precision.append(
                precision_score(self.test_target, testset_outputs_meta))

        time_end = time.time()  # end time
        print("\nTime cost: ", time_end - time_start, "seconds")

        # Each metric is stored under its own key (precision under
        # "precision", recall under "recall").
        cv_scores = {
            "test_accuracy": np.array(test_cv_accuracy),
            "test_precision_weighted": np.array(test_cv_precision),
            "test_recall_weighted": np.array(test_cv_recall)
        }
        return cv_scores
Ejemplo n.º 16
0
# Hold out 25% of the samples for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(data_values,
                                                    data_labels,
                                                    test_size=0.25,
                                                    random_state=42)

# In[ ]:

try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older scikit-learn releases only ship the search under grid_search.
    from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

model = AdaBoostClassifier()

# The base-learner parameter is called `estimator` in newer scikit-learn and
# `base_estimator` in older releases; use whichever this installation has.
_base_learner_param = ('estimator' if 'estimator' in model.get_params()
                       else 'base_estimator')

# Grid searched exhaustively: base learner, learning rate, ensemble size and
# boosting algorithm.
parameters = {
    _base_learner_param: [
        DecisionTreeClassifier(max_depth=3),
        DecisionTreeClassifier(max_depth=4),
        ExtraTreeClassifier(max_depth=4)
    ],
    'learning_rate': [0.01, 0.1, 0.5, 1.],
    'n_estimators': [5, 10, 15, 20, 30, 40, 50, 75, 100, 125],
    'algorithm': ['SAMME', 'SAMME.R']
}

AdaBoostClf = GridSearchCV(model, param_grid=parameters)
AdaBoostClf.fit(X_train, y_train)
score = AdaBoostClf.score(X_test, y_test)
prediction = AdaBoostClf.predict(X_test)
print("Accuracy using ", AdaBoostClf, " classifier is: ", score)
print("-------------------------------------------")
print("Below is the confusion Matrix for ", AdaBoostClf)
print(metrics.confusion_matrix(y_test, prediction))
Ejemplo n.º 17
0
def third_generation(X, y, size=200, seed=None):
    """Build a large heterogeneous pool of fitted classifiers and sample from it.

    The pool is the concatenation of MLPs, k-NN, polynomial and RBF SVMs,
    decision trees, extra trees, AdaBoost over both tree types, the members
    of three bagging ensembles, the three bagging ensembles themselves, the
    members of a random forest, logistic regressions and SGD classifiers.
    Every pool member that ``check_model_is_fitted`` reports as unfitted is
    fitted on ``(X, y)``.

    Parameters
    ----------
    X : 2-D array
        Training samples; indexed as ``X[0, :]`` and ``X.shape[0]``.
    y : array-like
        Training labels passed to every ``fit`` call.
    size : int, default 200
        Number of estimators kept after shuffling the pool.
    seed : int or None
        Seed given to ``np.random.seed`` right before the shuffle.

    Returns
    -------
    (estimators, pool_name)
        ``estimators`` is the first ``size`` models of a random permutation of
        the pool. ``pool_name`` is the list of names for the *whole* pool in
        its original (unshuffled) order.
    """
    # NOTE(review): `pool_name` is returned unshuffled and unsliced, so it is
    # not index-aligned with `estimators`. Kept as-is for caller
    # compatibility -- confirm whether callers expect aligned names.

    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ),
                      momentum=m,
                      learning_rate_init=a) for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    # 40 neighbourhood sizes spread between 1 and the number of samples.
    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neigbhors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neigbhors_number, ['uniform', 'distance'])
    ]
    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    # The same parameter grid is shared by decision trees and extra trees.
    dt_params = list(itertools.product(['gini', 'entropy'],
                                       [1, 2, 3, 4, 5, None],
                                       [None, 'sqrt', 'log2'],
                                       ['best', 'random']))
    dt_clf = [
        DecisionTreeClassifier(criterion=c,
                               max_depth=d,
                               max_features=f,
                               splitter=s) for (c, d, f, s) in dt_params
    ]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    et_clf = [
        ExtraTreeClassifier(criterion=c,
                            max_depth=d,
                            max_features=f,
                            splitter=s) for (c, d, f, s) in dt_params
    ]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    # AdaBoost with 2..8192 rounds over trees of depth 1..3.
    ada_params = list(itertools.product([2**i for i in range(1, 14)],
                                        [1, 2, 3]))
    ada_dt_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=DecisionTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_et_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=ExtraTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    # Bagging ensembles are fitted up front because their individual members
    # (estimators_) are added to the pool as classifiers in their own right.
    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = [
        'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)
    ]

    # The full ensembles also join the pool. The extra-tree slot must hold
    # bag_et (it previously repeated bag_dt, so bag_et was never included).
    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(nb_bag_stumps))]

    # Individual trees of a fitted random forest.
    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [
        LogisticRegression(penalty=l, C=c, fit_intercept=f)
        for (l, c, f) in log_parameters
    ]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [
        SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
        for (l, p, f, l1) in sgd_parameters
    ]
    sgd_name = [
        'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters
    ]

    # The order of the name list mirrors the order of the model list.
    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \
                dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \
                log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \
                ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
                bag_stump_name + dt_rf_name + log_name + sgd_name

    # Fit whatever is not fitted yet; the bagging and forest members already are.
    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    # Seed only the selection step, then keep the first `size` shuffled models.
    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]

    return estimators, pool_name
Ejemplo n.º 18
0
def run():
    """Fit the selected model(s) on the training CSV and save submission CSVs.

    Reads 'train.csv' and 'test.csv', splits off a small hold-out part of the
    training data, runs all three frames through ``transform_data`` and then
    trains either every candidate model or the single one picked by
    ``model_num_single_model``. Each model's predictions for the submission
    frame are written to a CSV named after the model class. Finally an
    all-zero baseline submission is written to 'all_true.csv'.
    """
    data = pd.read_csv('train.csv')
    submission_data = pd.read_csv('test.csv')
    train_data, test_data = split_data(data, test_size=0.01)

    # Column groups handed to every transform_data call.
    column_groups = {
        'num_atribs': ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
        'cat_atribs': ['Sex', 'Embarked'],
    }

    # The call order is kept: only the submission frame is transformed with
    # no_fit=True.
    transformed_train_data = transform_data(train_data, **column_groups)
    transformed_test_data = transform_data(test_data, **column_groups)
    transformed_submission_data = transform_data(submission_data,
                                                 no_fit=True,
                                                 **column_groups)

    train_data_labels = train_data['Survived']
    test_data_labels = test_data['Survived']

    # Candidate models, keyed by their position in this list (0..10).
    models = dict(
        enumerate([
            RandomForestClassifier(max_depth=None,
                                   max_leaf_nodes=None,
                                   warm_start=True),
            LinearSVC(),
            NuSVC(),
            SVC(C=1.0),
            DecisionTreeClassifier(),
            ExtraTreeClassifier(),
            GaussianNB(),
            KNeighborsClassifier(),
            MLPClassifier(max_iter=1000),
            AdaBoostClassifier(),
            GaussianProcessClassifier(),
        ]))
    # Key of the single model to run; None means "run every model".
    model_num_single_model = 0

    # MLP, SVC, RandForest

    # param_dist = {
    #     'C': list(range(1, 15)),
    #     'kernel': ['rbf', 'sigmoid'], #['rbf'], 'poly', 'linear', 'sigmoid', 'precomputed'],
    #     'degree': [3],
    #     'gamma': ['auto'],
    #     'coef0': [0.0],
    #     'shrinking': [True],
    #     'probability': [False],
    #     'tol': [1e-3],
    #     'cache_size': list(range(1, 2000)),
    #     'class_weight': [None],
    #     'verbose': [False],
    #     'max_iter': [-1],
    #     'decision_function_shape': ['ovr'],
    #     'random_state': [42]
    # }
    #
    # transformed_train_data, saved_pasid_train = drop_passenger_iD(transformed_train_data)
    # transformed_submission_data, saved_pasid_pred = drop_passenger_iD(transformed_submission_data)
    #
    # rand_search = RandomizedSearchCV(models[model_num_single_model], n_iter=10000, param_distributions=param_dist)
    # rand_search.fit(transformed_train_data, train_data_labels)
    # print(rand_search.best_estimator_)
    # submission_prediction = rand_search.predict(transformed_submission_data)
    #
    # df = pd.DataFrame()
    # df['PassengerId'] = saved_pasid_pred
    # df['Survived'] = submission_prediction
    #
    # df.to_csv(
    #     '{}submission_data_randcv_SVC.csv'.format(paths.get('saved_predictions_path')), index=False)

    def save_submission(estimator):
        """Train `estimator`, predict the submission set and write it to CSV."""
        # prediction_on_train_split = run_model(estimator, transformed_train_data, train_data_labels, transformed_test_data)
        predicted = run_model(estimator, transformed_train_data,
                              train_data_labels, transformed_submission_data)
        target_path = '{}submission_data_{}.csv'.format(
            paths.get('saved_predictions_path'),
            type(estimator).__name__)
        predicted.to_csv(target_path, index=False)

    if model_num_single_model is not None:
        save_submission(models[model_num_single_model])
    else:
        for candidate in models.values():
            save_submission(candidate)

    # Baseline submission: passenger ids 892..1309, all predicted as 0.
    baseline = pd.DataFrame({
        'PassengerId': list(range(892, 1310)),
        'Survived': np.zeros(1310 - 892),
    })
    baseline.to_csv('all_true.csv', index=False)
Ejemplo n.º 19
0
def main():
    """
    Randomized hyperparameter search for three learners per training set.

    For each of the two pickled training sets and each reactor parameter
    (reactor type, burnup, cooling, enrichment), this runs a 20-draw
    RandomizedSearchCV for a decision tree, an extra tree and, for the
    regression targets only, a Bayesian ridge model. The best parameters are
    appended to a per-training-set text file. The prediction tracking,
    scoring and learning-curve steps are currently commented out.
    """

    def _search(estimator, grid, X, y, scoring, cv):
        """Fit a 20-iteration randomized search and return the fitted search."""
        search = RandomizedSearchCV(estimator=estimator,
                                    param_distributions=grid,
                                    n_iter=20,
                                    scoring=scoring,
                                    n_jobs=-1,
                                    cv=cv,
                                    return_train_score=True)
        search.fit(X, y)
        return search

    # Parameters for the training and predictions
    CV = 10

    subsets = ('fiss', 'act', 'fissact', 'all')
    subset = subsets[2]

    pkl_base = './pkl_trainsets/2jul2018/2jul2018_trainset'

    for trainset in ('1', '2'):
        pkl = pkl_base + trainset + '_nucs_' + subset + '_not-scaled.pkl'
        trainXY = pd.read_pickle(pkl)
        trainX, rY, cY, eY, bY = splitXY(trainXY)
        if subset == 'all':
            top_n = 100
            nuc_set = top_nucs(trainX, top_n)
            trainX = filter_nucs(trainX, nuc_set, top_n)
        trainX = scale(trainX)

        # (code, ground truth, human-readable name) for each reactor
        # parameter, in the order the predictions are run.
        targets = (
            ('r', rY, 'reactor'),
            ('b', bY, 'burnup'),
            ('c', cY, 'cooling'),
            ('e', eY, 'enrichment'),
        )

        # loops through each reactor parameter to do separate predictions
        for Y, trainY, parameter in targets:
            # Reactor type is the only classification target. Compare by
            # value: identity checks against a string literal only work by
            # accident of interning.
            is_classification = Y == 'r'

            #######################
            # optimize parameters #
            #######################

            # initialize learners
            if is_classification:
                score = 'accuracy'
                kfold = StratifiedKFold(n_splits=CV, shuffle=True)
                alg1_init = DecisionTreeClassifier(class_weight='balanced')
                alg2_init = ExtraTreeClassifier(class_weight='balanced')
                alg3_init = GaussianNB()
            else:
                score = 'explained_variance'
                kfold = KFold(n_splits=CV, shuffle=True)
                alg1_init = DecisionTreeRegressor()
                alg2_init = ExtraTreeRegressor()
                alg3_init = BayesianRidge()

            # CV search the hyperparams
            # alg1
            alg1_grid = {
                "max_depth":
                np.linspace(3, 90).astype(int),
                "max_features":
                np.linspace(5,
                            len(trainXY.columns) - 6).astype(int)
            }
            alg1_opt = _search(alg1_init, alg1_grid, trainX, trainY, score,
                               kfold)
            alg1_init = alg1_opt.best_estimator_
            d1 = alg1_opt.best_params_['max_depth']
            f1 = alg1_opt.best_params_['max_features']

            # alg2 (same search space as alg1)
            alg2_grid = alg1_grid
            alg2_opt = _search(alg2_init, alg2_grid, trainX, trainY, score,
                               kfold)
            alg2_init = alg2_opt.best_estimator_
            d2 = alg2_opt.best_params_['max_depth']
            f2 = alg2_opt.best_params_['max_features']

            # alg3: only the regression model is tuned; GaussianNB is used
            # as constructed.
            alg3_grid = {
                'n_iter': np.linspace(50, 1000).astype(int),
                'alpha_1': np.logspace(-8, 2),
                'alpha_2': np.logspace(-8, 2),
                'lambda_1': np.logspace(-8, 2),
                'lambda_2': np.logspace(-8, 2)
            }
            if not is_classification:
                alg3_opt = _search(alg3_init, alg3_grid, trainX, trainY,
                                   score, kfold)
                alg3_init = alg3_opt.best_estimator_
                it = alg3_opt.best_params_['n_iter']
                a1 = alg3_opt.best_params_['alpha_1']
                a2 = alg3_opt.best_params_['alpha_2']
                l1 = alg3_opt.best_params_['lambda_1']
                l2 = alg3_opt.best_params_['lambda_2']

            # Save dat info (append mode: one section per reactor parameter)
            param_file = 'trainset_' + trainset + '_hyperparameters_alt-algs.txt'
            with open(param_file, 'a') as pf:
                pf.write(
                    'The following parameters are best from the randomized search for the {} parameter prediction:\n'
                    .format(parameter))
                pf.write('max depth for dtree is {}\n'.format(d1))
                pf.write('max features for dtree is {}\n'.format(f1))
                pf.write('max depth for xtree is {}\n'.format(d2))
                pf.write('max features for xtree is {}\n'.format(f2))
                if not is_classification:
                    pf.write('num iterations for bayes reg is {}\n'.format(it))
                    pf.write('alpha 1 for bayes reg is {}\n'.format(a1))
                    pf.write('alpha 2 for bayes reg is {}\n'.format(a2))
                    pf.write('lambda 1 for bayes reg is {}\n'.format(l1))
                    pf.write('lambda 2 for bayes reg is {}\n'.format(l2))

            ########################
            # run predictions, etc #
            ########################

            #scores = ['explained_variance', 'neg_mean_absolute_error']
            #if Y == 'r':
            #    scores = ['accuracy', ]
            #csv_name = 'trainset_' + trainset + '_' + subset + '_' + parameter
            #
            #print("The {} predictions in trainset {} are beginning\n".format(parameter, trainset), flush=True)
            #
            ## track predictions
            #track_predictions(trainX, trainY, alg1_init, alg2_init, alg3_init, scores, kfold, csv_name)
            #print("\t Prediction tracking done\n", flush=True)

            ## calculate errors and scores
            #errors_and_scores(trainX, trainY, alg1_init, alg2_init, alg3_init, scores, kfold, csv_name)
            #print("\t CV scoring done\n", flush=True)

            ## learning curves
            #learning_curves(trainX, trainY, alg1_init, alg2_init, alg3_init, kfold, csv_name)
            #print("\t Learning curves done\n", flush=True)
            #
            #print("The {} predictions in trainset {} are complete\n".format(parameter, trainset), flush=True)
            #print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", flush=True)

    return
Ejemplo n.º 20
0
def main(tmpdir: str):
    """Run a small AutoML loop that picks one of five classifiers.

    Five candidate steps are declared with their hyperparameter spaces,
    wrapped in a ChooseOneStepOf pipeline, and searched by random sampling
    for 7 trials. The refitted best trial predicts the test set, the test
    accuracy is printed, and `tmpdir` (the on-disk hyperparameter cache) is
    deleted at the end.
    """

    def tree_space():
        """Return a fresh hyperparameter space shared by both single-tree models."""
        return HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        })

    # Define classification models, and hyperparams.

    decision_tree_classifier = SKLearnWrapper(DecisionTreeClassifier(),
                                              tree_space())
    extra_tree_classifier = SKLearnWrapper(ExtraTreeClassifier(),
                                           tree_space())

    # The remaining models are preceded by a step that ravels the expected
    # outputs.
    ridge_space = HyperparameterSpace({
        'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
        'fit_intercept': Boolean(),
        'normalize': Boolean()
    })
    ridge_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(RidgeClassifier(), ridge_space),
    ]).set_name('RidgeClassifier')

    logistic_space = HyperparameterSpace({
        'C': LogUniform(0.01, 10.0),
        'fit_intercept': Boolean(),
        'penalty': Choice(['none', 'l2']),
        'max_iter': RandInt(20, 200)
    })
    logistic_regression = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(LogisticRegression(), logistic_space),
    ]).set_name('LogisticRegression')

    forest_space = HyperparameterSpace({
        'n_estimators': RandInt(50, 600),
        'criterion': Choice(['gini', 'entropy']),
        'min_samples_leaf': RandInt(2, 5),
        'min_samples_split': RandInt(2, 4),
        'bootstrap': Boolean()
    })
    random_forest_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(RandomForestClassifier(), forest_space),
    ]).set_name('RandomForestClassifier')

    # Define a classification pipeline that lets the AutoML loop choose one of the classifier.
    # See also ChooseOneStepOf documentation: https://www.neuraxle.org/stable/api/steps/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf

    candidates = [
        decision_tree_classifier,
        extra_tree_classifier,
        ridge_classifier,
        logistic_regression,
        random_forest_classifier,
    ]
    pipeline = Pipeline([ChooseOneStepOf(candidates)])

    # Create the AutoML loop object.
    # See also AutoML documentation: https://www.neuraxle.org/stable/api/metaopt/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML

    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchSampler(),
        validation_splitter=ValidationSplitter(validation_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score,
                                         higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsOnDiskRepository(
            cache_folder=tmpdir),
        refit_best_trial=True,
        continue_loop_on_error=False)

    # Load data, and launch AutoML loop !

    X_train, y_train, X_test, y_test = generate_classification_data()
    auto_ml = auto_ml.fit(X_train, y_train)

    # Get the model from the best trial, and make predictions using predict, as per the `refit_best_trial=True` argument to AutoML.
    y_pred = auto_ml.predict(X_test)

    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    # Clean up the on-disk hyperparameter cache.
    shutil.rmtree(tmpdir)
Ejemplo n.º 21
0
def trainModel():
    """Prepare the engineered dataset and evaluate an XGBoost classifier on it.

    Steps, in order: load the pickled dataframe, standard-scale the features,
    drop low-variance columns, append 14 PCA components, keep the top 50% of
    features by univariate score, keep the features selected from a fitted
    extra-tree model, split learning and test rows using the 'TestSet' flag,
    under-sample the learning set, then run ``testClassifier`` on an
    ``XGBClassifier``. Results are printed; nothing is returned.
    """

    #Read processed dataframe
    # Raw string: the backslashes are path separators, not escape sequences.
    df = joblib.load(r'J:\Datasets\Exercises\Exercise5\EngineeredDataset.pkl')

    #Move test set indicator to the 2nd position of the dataframe
    cols = list(df)
    cols.insert(1, cols.pop(cols.index('TestSet')))
    # `.ix` was removed from pandas; `.loc` does the same label-based reorder.
    df = df.loc[:, cols]

    # Split dataframe into target and features
    y = df.iloc[:, 0]  # .as_matrix()
    flag = pd.DataFrame(df.iloc[:, 1])  # .as_matrix()
    X = df.iloc[:, 2:]  # .as_matrix()

    # Apply standard scaler in order to remove mean and scale to unit variance (so large-valued features won't
    #heavily influence the model)
    sc = StandardScaler()

    # Apply scaler
    colNames = X.columns
    X = sc.fit_transform(X)
    X = pd.DataFrame(X, columns=colNames)

    # Remove features whose variance is below the 0.16 threshold
    # NOTE(review): the features were standard-scaled just above, so this
    # presumably only drops constant columns -- confirm that is the intent.
    colNames = X.columns
    sel = VarianceThreshold(threshold=0.16)
    X = sel.fit_transform(X)
    # Get column names back
    newCols = [col for keep, col in zip(sel.get_support(), colNames) if keep]
    X = pd.DataFrame(X, columns=newCols)

    #Perform dimensionality reduction using PCA
    pca = PCA(n_components=14)
    pca.fit(X)
    #PCA scree plot - aid in determining number of components
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained_variance_')
    #plt.show()

    #Create PCA dataframe and append to original
    #Adding principle components adds additional insight to the dataframe
    #If PCs do not perform well, they will be removed in further feature selection procedures
    dfPCA = pd.DataFrame(pca.transform(X))
    dfPCA.columns = ['PCA' + str(col) for col in dfPCA.columns]
    X = pd.merge(X, dfPCA, left_index=True, right_index=True)

    # Perform univariate feature selection (ANOVA F-values)
    colNames = X.columns
    selection_Percent = SelectPercentile(percentile=50)
    X = selection_Percent.fit_transform(X, y)
    # Get column names back
    newCols = [
        col for keep, col in zip(selection_Percent.get_support(), colNames)
        if keep
    ]
    X = pd.DataFrame(X, columns=newCols)

    # Perform tree-based feature selection
    clf = ExtraTreeClassifier()
    clf = clf.fit(X, y)
    colNames = X.columns
    sel = SelectFromModel(clf, prefit=True)
    X = sel.transform(X)
    newCols = [col for keep, col in zip(sel.get_support(), colNames) if keep]
    X = pd.DataFrame(X, columns=newCols)

    #Split train and test set
    #Create new test set column in X
    X['TestSet'] = flag['TestSet'].tolist()
    X['Target'] = y.tolist()
    #Encode target (to binary) - for ROC AUC metric (0 for under 50k, 1 for over)
    le = LabelEncoder()
    X['Target'] = le.fit_transform(X['Target'])
    #Copy in dfTest all the test set values from X
    dfTest = X.loc[X['TestSet'] == 1]
    #Re-write X with only learning set values
    X = X.loc[X['TestSet'] == 0]
    #Define test set target
    dfTestTarget = dfTest['Target']
    #Remove target and 'test set' column from test dataframe
    # (non in-place: dfTest and X are filtered slices, and in-place drops on
    # them trigger pandas' chained-assignment warning)
    dfTest = dfTest.drop(['TestSet', 'Target'], axis=1)
    #Create new learning target series
    y = X['Target']
    #Drop newly inserted columns from learning dataframe
    X = X.drop(['TestSet', 'Target'], axis=1)
    #Retain column names
    colNames = X.columns

    # The dataset is heavily imbalanced in terms of classes, and balancing procedures need to be conducted
    # Testing various under / over / combined sampling procedures
    # Some of these procedures are very computationally expensive (and thus are not suitable for home use e.g. SMOTEENN)
    rus = RandomUnderSampler()
    # NOTE(review): newer imbalanced-learn releases name this method
    # `fit_resample` -- confirm against the installed version.
    X, y = rus.fit_sample(X, y)
    #sme = SMOTEENN(n_jobs=-1)
    #X, y, = sme.fit_sample(X, y)
    X = pd.DataFrame(X, columns=colNames)
    y = pd.Series(y, name='Target')

    #Define train/test variables
    X_train = X
    y_train = y
    X_test = dfTest
    y_test = dfTestTarget

    def testClassifier(clf):
        """Grid-search `clf`, then fit it plainly, printing test metrics for both.

        Reads X_train / y_train / X_test / y_test from the enclosing scope
        and returns `clf` after the plain fit.
        """

        #XGB tuning - concept, not in use
        param_grid = [{
            'max_depth': range(2, 6, 2),
            'min_child_weight': range(2, 6, 2),
            'n_estimators': range(100, 200, 75),
            'learning_rate': [0.1],
            'gamma': [0, 1, 10],
            'subsample': [0.6, 0.8],
            'colsample_bytree': [0.6, 0.8],
            'reg_alpha': [1, 10],
            'reg_lambda': [1, 10]
        }]
        # NOTE(review): the test set doubles as the early-stopping eval set,
        # so the test metrics below are not fully independent -- confirm.
        fit_params = {
            "early_stopping_rounds": 8,
            "eval_metric": "map",
            "eval_set": [[X_test, y_test]],
            "verbose": False
        }
        # NOTE(review): newer scikit-learn releases take fit parameters in
        # `fit(...)` rather than the constructor -- confirm the version.
        grid = GridSearchCV(clf,
                            param_grid,
                            fit_params=fit_params,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            scoring='average_precision')
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)

        predictions = fitted_classifier.predict(X_test)

        score1 = metrics.accuracy_score(y_test.values, predictions)
        score2 = metrics.roc_auc_score(y_test.values, predictions)
        score3 = metrics.cohen_kappa_score(y_test.values, predictions)
        score4 = metrics.classification_report(y_test.values, predictions)
        print('Accuracy score, ROC AUC, Cohen Kappa')
        print(score1, score2, score3)
        print('Classification Report')
        print(score4)

        print('Normal Fit')
        # Plain fit with the classifier's own parameters (no grid search).
        clf.fit(X_train, y_train)
        scoresCV = cross_val_score(clf,
                                   X_train,
                                   y_train,
                                   cv=3,
                                   verbose=0,
                                   n_jobs=-1)

        trainPredictions = clf.predict(X_train)
        testPredictions = clf.predict(X_test)
        #X_test['Predictions'] = testPredictions

        score1 = metrics.accuracy_score(y_test.values, testPredictions)
        score2 = metrics.roc_auc_score(y_test.values, testPredictions)
        score3 = metrics.cohen_kappa_score(y_test.values, testPredictions)
        score4 = metrics.classification_report(y_test.values, testPredictions)
        print('Train score: ',
              metrics.accuracy_score(y_train.values, trainPredictions))
        print('CV score: ', scoresCV)
        print('Accuracy score, ROC AUC, Cohen Kappa')
        print(score1, score2, score3)
        print('Classification Report')
        print(score4)

        #WITH UNDER-SAMPLING
        #Low Precision in Class 1 (~0.28) = suggests that too many salaries are labeled as >50k when they are <50k
        #Could be a potential after-effect of under-sampling
        #High Recall in Class 1 (~0.90) = suggests that the classifier is able to find all positive samples

        #WITHOUT UNDER-SAMPLING
        #High Precision in Class 1 (~0.76) = suggests that the classifiers handles negative samples well
        #Low Recall in Class 1 (~0.39) = suggests that the classifier is not able to find all positive samples

        return clf

    '''print('LR')
    lr = LogisticRegression(C = 100)
    clf = testClassifier(lr)
    print('DT')
    dt = DecisionTreeClassifier()
    clf = testClassifier(dt)
    export_graphviz(clf, out_file = 'tree.dot')
    print('RF')
    rf = RandomForestClassifier()
    clf = testClassifier(rf)'''
    print('XGB')
    gb = xgboost.XGBClassifier()
    clf = testClassifier(gb)
######### Create New Dataframe
# Same frame as `result`, without its "index" column.
r2 = result.drop("index", axis="columns")

# check new class counts
r2["fraudulent"].value_counts()

###################################################################
########################## Feature Selection ######################
###################################################################

# Feature selection with a single randomized tree:
# the first eight columns are the predictors, the last column is the target.
a = r2.iloc[:, :8]  # independent columns
b = r2.iloc[:, -1]  # target column

model = ExtraTreeClassifier().fit(a, b)

# feature_importances_ is exposed by tree-based classifiers once fitted
print(model.feature_importances_)

# Horizontal bar chart of the importances, labelled by column name
feat_importances = pd.Series(model.feature_importances_, index=a.columns)
feat_importances.nlargest(8).plot.barh()

#Almost all 8 variables are contributing towards output variable.

###############################################################
####################### Cross Validation ######################
###############################################################
Ejemplo n.º 23
0
from utils.IO import writeResToFile, loadResFromFile, getResultsFromFileAsArray, saveTableToFile
from utils.plot import generateChart
import pandas
from datasets.dataUrls import dataURL

# Load dataset number 20 from the local dataset directory.
x = dataURL[20]
url = "D:/projekty/UM/datasets/" + x + ".dat"
dataframe = pandas.read_csv(url)
array = dataframe.values
# The last column holds the class label; 'positive' becomes 1, anything else 0.
tmp = array[:, -1]
tmp2 = tmp == 'positive'
X = array[:, :-1]
y = tmp2.astype(int)

# classification tree with default parameter values
clf = ExtraTreeClassifier(random_state=1410)

# repeated stratified 5-fold cross-validation (2 repeats x 5 folds)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
# One score per (preprocessing method, fold, metric).
scores = np.zeros((len(preprocs), 5 * 2, len(metrics)))

for fold_id, (train, test) in enumerate(rskf.split(X, y)):
    for preproc_id, preproc in enumerate(preprocs):
        # Start every run from an unfitted copy of the classifier.
        clf = clone(clf)

        # A None entry means "no resampling": train on the raw fold.
        if preprocs[preproc] is None:
            X_train, y_train = X[train], y[train]
        else:
            X_train, y_train = preprocs[preproc].fit_resample(
                X[train], y[train])
Ejemplo n.º 24
0
# Naive Bayes, nearest neighbours and discriminant analysis
classifiers.extend([
    GaussianNB(),
    KNeighborsClassifier(),
    LinearDiscriminantAnalysis(),
])

# Support vector machines
classifiers.extend([
    SVC(random_state=random_state, probability=True),
    NuSVC(random_state=random_state, probability=True),
    LinearSVC(random_state=random_state),
])

# Single trees
classifiers.extend([
    DecisionTreeClassifier(random_state=random_state),
    ExtraTreeClassifier(random_state=random_state),
])

# Accuracy cross validation for algorithms:
# one array of per-fold accuracies per classifier, in list order.
cf_results_acc = []
for classifier in classifiers:
    cf_results_acc.append(
        cross_val_score(classifier,
                        features_train,
                        target_train,
                        scoring="accuracy",
                        cv=kfold,
                        n_jobs=4))

#Means and standard deviation for each machine learning model utilized
cf_means_acc = []
Ejemplo n.º 25
0
                class_names = bi_class_target_attrs,
                filled = True, rounded = True,
                special_characters = True)
# Render the exported CART graph to PDF. The command is passed as an argument
# list so no shell is involved in launching `dot`.
print(check_output(['dot', '-Tpdf', 'cart.dot', '-o', 'cart.pdf']))

# Predict once and reuse the labels for every label-based metric below.
cart_test_pred = clf_cart.predict(rnd_test_X)
print("Accuracy = %s"%accuracy_score(rnd_test_y, cart_test_pred))
print("Precision = %s"%precision_score(rnd_test_y, cart_test_pred))
print("Recall = %s"%recall_score(rnd_test_y, cart_test_pred))
print("F = %s"%fbeta_score(rnd_test_y, cart_test_pred, beta=1))
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, cart_test_pred))

# ROC AUC goes through the scorer, which takes the estimator and the raw data.
roc_auc_scorer = get_scorer("roc_auc")
print("ROC AUC = %s"%roc_auc_scorer(clf_cart, rnd_test_X, rnd_test_y))

# ROC curve from the probability assigned to the class in column 1.
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_cart.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = 'CART-2')

## randomized tree with default setting
clf_rnd_tree = ExtraTreeClassifier()
clf_rnd_tree.fit(rnd_training_X, rnd_training_y)
export_graphviz(clf_rnd_tree, out_file = 'default_rnd_tree.dot',
                feature_names = attribute_names,
                class_names = bi_class_target_attrs,
                filled = True, rounded = True,
                special_characters = True)

# Render the exported graph to PDF. The command is passed as an argument list
# so no shell is involved in launching `dot`.
print(check_output(['dot', '-Tpdf', 'default_rnd_tree.dot',
                    '-o', 'default_rnd_tree.pdf']))

# Predict once and reuse the labels for every label-based metric below.
rnd_tree_test_pred = clf_rnd_tree.predict(rnd_test_X)
print("Accuracy = %s"%accuracy_score(rnd_test_y, rnd_tree_test_pred))
print("Precision = %s"%precision_score(rnd_test_y, rnd_tree_test_pred))
print("Recall = %s"%recall_score(rnd_test_y, rnd_tree_test_pred))
print("F = %s"%fbeta_score(rnd_test_y, rnd_tree_test_pred, beta=1))
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, rnd_tree_test_pred))

# ROC curve from the probability assigned to the class in column 1, drawn on
# the shared axes.
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_rnd_tree.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = "Randomized tree-1")
axes_roc.set_title("ROC of CART and a randomized tree")
Ejemplo n.º 26
0
class ExtraTreeClass:
    """
    Train and evaluate an ExtraTreeClassifier on the bundled sample CSV.

    Name      : ExtraTreeClassifier
    Attribute : None
    Method    : predict, predict_by_cv, predict_by_gs, save_model

    Constructing an instance reads ``classifier/resource/classifier_sample.csv``
    (relative to the parent of this file's directory), uses the ``quality``
    column as the label, holds out 20% of the rows as a test set and fits the
    model on the remainder.
    """
    def __init__(self):
        # Algorithm name; also used as the saved model's file stem.
        self._name = 'extratree'
        # Base path: the parent directory of the directory holding this file.
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Silence warning messages. NOTE: this installs a process-wide filter,
        # not one scoped to this instance.
        warnings.filterwarnings('ignore')

        # Load the raw data.
        data = pd.read_csv(self._f_path +
                           "/classifier/resource/classifier_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Separate the features from the label ("quality") column.
        self._x = data.drop("quality", axis=1)
        self._y = data["quality"]

        # Split into training and test sets (fixed seed, 20% test).
        self._x_train, self._x_test, self._y_train, self._y_test = train_test_split(
            self._x, self._y, test_size=0.2, shuffle=True, random_state=42)
        # Declare the model.
        self._model = ExtraTreeClassifier()

        # Train the model.
        self._model.fit(self._x_train, self._y_train)

    def predict(self):
        """Predict on the held-out test set, print a report, return the accuracy."""
        y_pred = self._model.predict(self._x_test)

        # Print the per-class report.
        print(classification_report(self._y_test, y_pred))

        score = accuracy_score(self._y_test, y_pred)

        # Show and return the score.
        print(f'Score = {score}')
        return score

    def predict_by_cv(self):
        """Return the 5-fold (shuffled) cross-validation scores on the full data.

        Raises:
            Exception: if the model has no ``score`` attribute and therefore
                cannot be cross-validated.
        """
        # Cross-validation needs an estimator that can score itself.
        if not hasattr(self._model, "score"):
            raise Exception('Not Support CrossValidation')

        cv = KFold(n_splits=5, shuffle=True)
        cv_score = cross_val_score(self._model, self._x, self._y, cv=cv)
        # Show and return the scores.
        print(f'Score = {cv_score}')
        return cv_score

    def predict_by_gs(self):
        """Placeholder for a GridSearchCV-based prediction; not implemented."""
        pass

    def save_model(self, renew=False):
        """Dump the fitted model to ``<base>/model/<name>.pkl``.

        Args:
            renew: when true and a saved model already exists, the existing
                file is first renamed to ``<name><timestamp>.pkl`` so it is
                kept alongside the new dump instead of being overwritten.
        """
        model_path = self._f_path + f'/model/{self._name}.pkl'
        if renew and os.path.isfile(model_path):
            # Keep the previous model under a timestamped name.
            os.rename(
                model_path, self._f_path +
                f'/model/{str(self._name) + str(time.time())}.pkl')
        joblib.dump(self._model, model_path)

    def __del__(self):
        # Drop the references to the data and the model. __init__ can fail
        # part-way (e.g. the CSV is missing), leaving only some attributes
        # set, so each one is removed only if it exists instead of assuming
        # they all do.
        for attr in ('_x_train', '_x_test', '_y_train', '_y_test',
                     '_x', '_y', '_model'):
            self.__dict__.pop(attr, None)
Ejemplo n.º 27
0
#    data = ''
    with open(fname) as f:
        for s in f:
            tmp = map(int, s.split())
            labels.append(tmp[-1])
            res.append(tmp[:-1])
#            data += (str(tmp)[1:-1]).replace(',', '')+'\n'
#    with open('out.txt', 'w') as o:
#        o.write(str(data)[1:-1])
    return res, labels

X, Y = readData('german.data-numeric.txt')
# Hold out the last 200 rows for evaluation; train on everything before them.
Xt, XT = X[:-200], X[-200:]
Yt, YT = Y[:-200], Y[-200:]
print(len(Xt))
clf = ExtraTreeClassifier(max_depth=None, random_state=0).fit(Xt, Yt)

# Count the held-out rows whose predicted label differs from the true one.
err = 0
for x, expected in zip(XT, YT):
    if clf.predict(x) != expected:
        # Class probabilities of the misclassified row (kept for inspection).
        prob = clf.predict_proba(x)
        err += 1

print(err)
from sklearn.tree import ExtraTreeClassifier

# Fit and report a DecisionTreeClassifier, then an ExtraTreeClassifier, on the
# same train/test split. After the loop `Model` and `y_pred` refer to the
# ExtraTreeClassifier run.
for Model in (DecisionTreeClassifier(), ExtraTreeClassifier()):
    Model.fit(X_train, y_train)

    y_pred = Model.predict(X_test)

    # Summary of the predictions made by the classifier
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    # Accuracy score
    print('accuracy is', accuracy_score(y_pred, y_test))
import numpy as np


def sigmoid(z):
    """Return the logistic function 1 / (1 + e**-z) of z (elementwise for arrays)."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
    #
    print("CV error = %f +-%f" % (np.mean(scores), np.std(scores)))
    #
    print "Cross validation"
    scores = cross_val_score(RandomForestClassifier(), training, classes,
                             cv=KFold(n=len(training), n_folds=5, random_state=42),
                             scoring="accuracy")
    print("CV error = %f +-%f" % (1. - np.mean(scores), np.std(scores)))
    print("Accuracy =", accuracy_score(y_test, tlf.predict(X_test)))
    print("Precision =", precision_score(y_test, tlf.predict(X_test)))
    print("Recall =", recall_score(y_test, tlf.predict(X_test)))
    print("F =", fbeta_score(y_test, tlf.predict(X_test), beta=1))
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

    print "Extra Tree classifier"
    rlf = ExtraTreeClassifier()
    rlf.fit(training, classes)

    print("Training error =", zero_one_loss(classes, rlf.predict(training)))

    X_train, X_test, y_train, y_test = train_test_split(training, classes)
    rlf = ExtraTreeClassifier()
    rlf.fit(X_train, y_train)
    print("Training error =", zero_one_loss(y_train, rlf.predict(X_train)))
    print("Test error =", zero_one_loss(y_test, rlf.predict(X_test)))

    scores = []
    print "K-fold cross validation"
    for train, test in KFold(n=len(training), n_folds=5, random_state=42):
        X_train, y_train = training[train], classes[train]
        X_test, y_test = training[test], classes[test]
#df.drop('character', axis=0, inplace=True)
#df = df.astype(np.uint8)

###############################
# Work on a reproducible 10% random sample of the frame.
df_sample = df.sample(frac=0.1, random_state=0)

# Display names, listed in the same order as `classifiers` below.
names = [
    'RidgeClassifier', 'BernoulliNB', 'GaussianNB', 'ExtraTreeClassifier',
    'DecisionTreeClassifier', 'NearestCentroid', 'KNeighborsClassifier',
    'ExtraTreesClassifier', 'RandomForestClassifier'
]
# One default-configured estimator per name.
classifiers = [
    RidgeClassifier(),
    BernoulliNB(),
    GaussianNB(),
    ExtraTreeClassifier(),
    DecisionTreeClassifier(),
    NearestCentroid(),
    KNeighborsClassifier(),
    ExtraTreesClassifier(),
    RandomForestClassifier()
]
# Per-classifier result accumulators. Only test_scores and train_scores are
# filled in the lines visible here.
test_scores, train_scores, fit_time, score_time = [], [], [], []
# NOTE(review): this variable is not passed to cross_validate below, which is
# called with return_train_score=True - confirm whether it is still needed.
return_train_score = "warn"
for clf in classifiers:
    # Features are every column but the last; the last column is the target.
    scores = cross_validate(clf,
                            df_sample.iloc[:, :-1],
                            df_sample.iloc[:, -1],
                            return_train_score=True)
    # Keep the mean score across folds for this classifier.
    test_scores.append(scores['test_score'].mean())
    train_scores.append(scores['train_score'].mean())
Ejemplo n.º 31
0
def _practice_set_classifier_factories():
    """Return zero-argument constructors for the candidate classifiers, in the order they are tried."""

    def voting():
        # Voting classifier: combines completely different model families
        # and uses a majority vote.
        return VotingClassifier(estimators=[
            ('svc', SVC()),
            ('rfc', RFC(bootstrap=False)),
            ('etc', ETC()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('qda', quadda()),
        ])

    # Lambdas keep construction lazy: a classifier (and the name it needs) is
    # only touched when that slot is actually requested.
    return [
        voting,
        lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        lambda: ETC(),
        lambda: BaggingClassifier(ETC()),
        lambda: SVC(),
        # Quadratic discriminant analysis - quadratic decision boundary.
        lambda: quadda(),
        lambda: DTC(),
        lambda: neighbors.KNeighborsClassifier(),
        # Linear discriminant analysis - linear decision boundary.
        lambda: linda(),
        lambda: RFC(),
        lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        lambda: RFC(bootstrap=False),
        lambda: GBC(),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
        lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        lambda: NearestCentroid(),
        lambda: ABC(),
    ]


def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    """Fit up to ``numfiers`` classifiers and post-process each one's predictions.

    The classifiers are taken, in a fixed order, from
    ``_practice_set_classifier_factories``; each is fitted on
    ``(xtrain, ytrain)`` and used to predict ``xtest``. Every prediction
    vector is then reduced by the window-mode helpers and converted to a
    string.

    Args:
        numfiers: number of classifiers to use. Values larger than the number
            of available classifiers use all of them.
        xtrain, ytrain: training features and labels (``ytrain`` is flattened).
        xtltrain: accepted for interface compatibility; not used here.
        xtltest: per-recording index information for ``xtest``, forwarded to
            the window-mode helpers.
        xtest: feature matrix to classify; rows with NaN/Inf are removed by
            ``removeNanAndInf`` first.
        ytarget: optional target labels for ``xtest``.
        testing: when true, the ``temp*`` helper variants are used together
            with ``grids``.
        grids: forwarded to the ``temp*`` helpers when ``testing`` is true.

    Returns:
        (predictionStringMat, targetStringMat, finalPredMat): one prediction
        string per classifier, one target string per classifier, and a flat
        list with every classifier's windowed predictions as ints.
    """
    # Remove NaN, Inf and -Inf values from the xtest feature matrix.
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)

    # Remember whether a target was supplied *before* np.ravel turns it into
    # an array: comparing an ndarray with None is elementwise and has no
    # unambiguous truth value.
    has_target = ytarget is not None

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    # If xtest is an NxM matrix, column k holds classifier k's prediction vector.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    count = 0
    for make_classifier in _practice_set_classifier_factories()[:numfiers]:
        classifier = make_classifier()
        classifier.fit(xtrain, ytrain)
        predictionMat[:, count] = classifier.predict(xtest)
        count += 1

    predictionStringMat = []
    targetStringMat = []
    finalPredMat = []
    targets1 = []
    predictions1 = []

    # Only the columns that were actually filled are post-processed; when
    # numfiers exceeds the number of classifiers the remaining np.empty
    # columns hold uninitialised memory.
    for colCount in range(count):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)

        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat.extend(int(value) for value in modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    # Confusion matrix of the first classifier against the target. The result
    # is currently unused (the prints that consumed it are disabled); it is
    # computed once here rather than once per column.
    if not testing and has_target and predictions1:
        confusionme = confusion_matrix(targets1[0], predictions1[0])

    return predictionStringMat, targetStringMat, finalPredMat
def all_classifier_models():
    """Fit a large catalogue of estimators and collect their accuracies.

    Every model is fitted on the module-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``. A model that raises while fitting,
    predicting or scoring is reported on stdout and left out of the results.

    Returns:
        (metrix, test_accuracy, names) where ``metrix`` is a list of
        ``[name, train_accuracy_percent, test_accuracy_percent]`` rows, and
        ``test_accuracy`` / ``names`` are the matching columns for the models
        that succeeded, in evaluation order.
    """
    models = []
    metrix = []
    # Per-model classification reports; collected but not part of the return value.
    c_report = []

    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    # Stacking: random forest + scaled linear SVM, combined by logistic regression.
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    # Hard-voting ensemble over three different model families.
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))

    test_accuracy = []
    names = []
    for name, model in models:
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            train_acc = round(model.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test,y_pred) *100
            c_report.append(classification_report(y_test, y_pred))
            # Results are recorded only after every step above has succeeded,
            # so the three output lists always stay aligned.
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception:
            # Best-effort sweep: a model that cannot handle this data is
            # skipped. Catching Exception (not a bare except) keeps Ctrl-C
            # and SystemExit able to stop the run.
            print("Exception Occurred  :",name)
    return metrix,test_accuracy,names