Example #1
def variables_relevantes_arbol(X, Y, alpha=None):

    if len(X) == 0:
        logger.info("No se ingreso informacion de variables")
        # return the same (importance, relevant_features) shape as the normal path
        return [], []

    features = list(X.columns)

    if alpha is None:
        alpha = 1.0 / len(features)
        logger.info(
            'Se calcula el valor minimo de aceptacion de importancia: {0}'.
            format(alpha))

    try:
        model = ExtraTreeClassifier()
        model.fit(X, Y)

        importance = model.feature_importances_

        relevant_features = []
        for i in range(len(features)):
            if importance[i] > alpha:
                relevant_features.append(features[i])

    except Exception as e:
        logger.info(
            'Error con el metodo de arboles, no se determinaron variables relevantes: {0}'
            .format(e))
        importance = []
        relevant_features = []

    return importance, relevant_features
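A minimal usage sketch for the function above, assuming a pandas DataFrame `X`, a label series `Y`, and a configured `logger` are available; the column names are invented for illustration.

import pandas as pd

X = pd.DataFrame({"edad": [25, 32, 47, 51], "ingreso": [1800, 2400, 3100, 4000]})
Y = pd.Series([0, 0, 1, 1])

importance, relevant = variables_relevantes_arbol(X, Y)
print(relevant)  # columns whose importance exceeds the default threshold 1/len(features)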
Example #2
def dTree(data, labels, test, impurity="gini", mdepth=None):
    newData = pd.DataFrame()
    newTest = pd.DataFrame()
    le = LabelEncoder()
    for datum in data:
        newData[datum] = le.fit_transform(data[datum])
    for testItem in test:
        newTest[testItem] = le.fit_transform(test[testItem])
    tree1 = DecisionTreeClassifier(criterion=impurity,
                                   max_depth=mdepth,
                                   random_state=42)
    tree2 = ExtraTreeClassifier(criterion=impurity,
                                max_depth=mdepth,
                                random_state=42)
    tree3 = RandomForestClassifier(criterion=impurity,
                                   max_depth=mdepth,
                                   random_state=42)
    tree1.fit(newData, labels)
    tree2.fit(newData, labels)
    tree3.fit(newData, labels)
    predict1 = tree1.predict(newTest)
    print("tree1", evaluate(predict1, validation_genres))
    predict2 = tree2.predict(newTest)
    print("tree2", evaluate(predict2, validation_genres))
    predict3 = tree3.predict(newTest)
    print("tree3", evaluate(predict3, validation_genres))
    combined_prediction = voting([predict1, predict2, predict3], [1, 1, 1])
    return combined_prediction
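The helpers `evaluate` and `voting` and the global `validation_genres` used above are not shown in this example. The sketch below is only one plausible, purely illustrative shape for the weighted majority vote that `voting` appears to perform; it is not the original implementation.

from collections import Counter
import numpy as np

def voting(predictions, weights):
    # predictions: list of equal-length label sequences, one per model
    # weights: one vote weight per model
    combined = []
    for votes in zip(*predictions):
        tally = Counter()
        for label, weight in zip(votes, weights):
            tally[label] += weight
        combined.append(tally.most_common(1)[0][0])
    return np.array(combined)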
Example #3
def variables_relevantes_arbol(X, Y, alpha=None):

    if len(X) == 0:
        logger.info("No variable information was provided")
        # return the same (importance, relevant_features) shape as the normal path
        return [], []

    features = list(X.columns)

    if alpha is None:
        alpha = 1.0 / len(features)
        logger.info(
            'Acceptance threshold for variable importance is calculated: {0}'.
            format(alpha))

    try:
        model = ExtraTreeClassifier()
        model.fit(X, Y)

        importance = model.feature_importances_

        relevant_features = []
        for i in range(len(features)):
            if importance[i] > alpha:
                relevant_features.append(features[i])

    except Exception as e:
        logger.info(
            'Error with the tree-based model, no relevant variables were found: {0}'
            .format(e))
        importance = []
        relevant_features = []

    return importance, relevant_features
Example #4
def apply_extra_trees_classifier(trainData, targetTrain, testData, targetTest):
    """
    Applies the extremely randomized tree algorithm to the dataset with the given parameters

    Args:
        trainData, targetTrain: training features and class labels
        testData, targetTest: test features and class labels

    """
    # fit a single extremely randomized tree to the data
    etc = ExtraTreeClassifier(class_weight=None,
                              criterion='gini',
                              max_depth=None,
                              max_features='sqrt',  # 'auto' is deprecated/removed in newer scikit-learn; 'sqrt' is the classifier equivalent
                              max_leaf_nodes=None,
                              min_samples_leaf=1,
                              min_samples_split=2,
                              min_weight_fraction_leaf=0.0,
                              random_state=None,
                              splitter='random')
    etc.fit(trainData, targetTrain)
    print(etc)
    # make predictions
    expected = targetTest
    predicted = etc.predict(testData)
    # summarize the fit of the model
    print(accuracy_score(expected, predicted))
def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(
        tk.END,
        "\n\nIMPORTING ExtraTree" + "\nProcessing this might take a while...",
        "bold")
    text.update_idletasks()
    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    ETC.fit(X_train, Y_train)
    Y_pred = ETC.predict(X_test)
    text.insert(
        tk.END, "\n\nExtra Tree Classifier report \n" +
        classification_report(Y_test, Y_pred), "bold")
    text.insert(
        tk.END,
        "*****roc_auc_score: %0.3f*****\n" % roc_auc_score(Y_test, Y_pred),
        "bold")
    text.insert(
        tk.END, "Extra Tree Classifier confusion matrix \n" +
        str(confusion_matrix(Y_test, Y_pred)), "bold")
    # compare predictions against the true labels, not against themselves
    score = accuracy_score(Y_test, Y_pred)
    text.insert(tk.END, "Extra tree score = {}\n".format(score), "bold")
    text.update_idletasks()
    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
Example #6
class ExtraTreeClassifierTestCase(SchemaValidationTestCase, unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.model = ExtraTreeClassifier()
        iris = load_iris()
        X = iris.data.astype(np.float32)
        y = iris.target.astype(np.int32)
        self.model.fit(X, y)
 def tree_select(self):
     # embedded feature selection with a tree model
     clf = ExtraTreeClassifier(max_depth=7)
     clf.fit(self.X, self.y.ravel())
     feature_var = list(clf.feature_importances_)
     features = dict(zip(self.feature_names, feature_var))
     # print(features)
     features = list(dict(sorted(features.items(), key=lambda d: d[1], reverse=True)).keys())[:self.select_feature_num]  # keep the most important features
     return set(features)
Example #8
def get_feature_relevance_tree(X, y):
    vect = DictVectorizer()
    X = vect.fit_transform(X)

    tree = ExtraTreeClassifier(criterion='entropy')
    tree.fit(X, y)

    # inverse_transform expects an (n_samples, n_features) array, so pass one row
    return zip(['general'],
               vect.inverse_transform(tree.feature_importances_.reshape(1, -1)))
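A hedged usage sketch for `get_feature_relevance_tree`, assuming it is called with a list of feature dicts (as `DictVectorizer` expects) and a matching label list; the dict keys here are invented.

X = [{"length": 3, "upper": 1}, {"length": 7, "upper": 0}, {"length": 5, "upper": 1}]
y = [0, 1, 0]

for group, importances in get_feature_relevance_tree(X, y):
    print(group, importances)  # e.g. ('general', {'length': ..., 'upper': ...})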
Example #9
def get_decision_tree(X, y, depth=None):
    vect = DictVectorizer()
    X = vect.fit_transform(X)

    tree = ExtraTreeClassifier(max_depth=depth)
    tree.fit(X, y)

    return export_graphviz(tree,
                           feature_names=vect.feature_names_,
                           class_names=tree.classes_,
                           filled=True)
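A short usage sketch for `get_decision_tree`, again assuming dict-style records and string class labels; it simply writes the returned DOT source to a file that Graphviz can render.

records = [{"color": 1, "size": 10}, {"color": 0, "size": 4}, {"color": 1, "size": 7}]
labels = ["ripe", "raw", "ripe"]

dot_source = get_decision_tree(records, labels, depth=3)
with open("tree.dot", "w") as handle:
    handle.write(dot_source)  # render with: dot -Tpng tree.dot -o tree.png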
Example #10
def trees_models(x_train, y_train):
    from sklearn.tree import DecisionTreeClassifier
    classifier1 = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier1.fit(x_train, y_train)

    from sklearn.tree import ExtraTreeClassifier
    classifier2 = ExtraTreeClassifier()
    classifier2.fit(x_train, y_train)

    print('DecisionTreeClassifier training accuracy: ',
          classifier1.score(x_train, y_train))
    print('ExtraTreeClassifier training accuracy: ',
          classifier2.score(x_train, y_train))

    return classifier1, classifier2
Example #11
def test_extra_tree_clf():
    X, y = load_iris(return_X_y=True)
    X_ = X.tolist()
    for y_ in [y, (y == 0).astype(int), (y == 2).astype(int)]:
        for max_depth in [5, 10, None]:
            clf = ExtraTreeClassifier(max_depth=max_depth)  # use the loop's depth setting
            clf.fit(X, y_)
            clf_ = convert_estimator(clf)

            for method in METHODS:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    scores = getattr(clf, method)(X)
                scores_ = getattr(clf_, method)(X_)
                assert np.allclose(scores.shape, shape(scores_))
                assert np.allclose(scores, scores_, equal_nan=True)
Example #12
    def RecommendByET(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """多标签分类  """

        clf = ExtraTreeClassifier()
        clf.fit(train_data, train_data_y)
        predictions = clf.predict_proba(test_data)
        """预测结果转化为data array"""
        predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
        print(predictions)

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
Example #13
 def extra_tree_classifier(self):
     self.log.writeToLog('Running Extra Tree Classifier Model...')
     X_train, X_test, y_train, y_test = self.train_test_split()
     et = ExtraTreeClassifier()
     trained_model = et.fit(X_train, y_train)
     self.save_pickle(trained_model)
     y_pred = et.predict(X_test)
     self.model_auc_roc(y_test, y_pred, "Extra Tree Classifier Model")
     self.model_evaluation(y_test, y_pred, "Extra Tree Classifier Model")
Example #14
 def extra_tree(self):
     x_train, x_test, y_train, y_test = self.preprocessing()
     extra_tree_model = ExtraTreeClassifier()
     y_pred = extra_tree_model.fit(x_train, y_train).predict(x_test)
     acc = accuracy_score(y_test, y_pred)
     print('Extra Tree Classifier:- ', acc)
     conf = confusion_matrix(y_test, y_pred)
     f1 = f1_score(y_test, y_pred, average='micro')
     print('and its f1 score- ', f1)
     print('confusion matrix: \n', conf)
    def fit(self, X, y):
        """Build a random decision tree based classifier from the training set (X, y)."""

        # Remove protected features
        X_protect = np.delete(X, [self.prot_class], axis=1)

        num_tr = len(y)
        num_prot_1 = sum(X[:, self.prot_class])
        num_prot_0 = num_tr - num_prot_1

        #X_protect = X
        i = 0
        fair_trees = []
        predictions = []

        # Pick up fair trees
        while i < self.num_fair_trees:
            new_tree = ExtraTreeClassifier(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                max_features=1)
            new_tree.fit(X_protect, y)
            new_prediction = new_tree.predict(X_protect)

            # Calculate the probability we predict someone will dropout between groups (Statistical Parity)
            num_pred_1 = len([
                e for e in range(0, num_tr)
                if new_prediction[e] == 0 and X[e, self.prot_class] == 1
            ])
            num_pred_0 = len([
                e for e in range(0, num_tr)
                if new_prediction[e] == 0 and X[e, self.prot_class] == 0
            ])
            stat_parity = abs(num_pred_1 / num_prot_1 -
                              num_pred_0 / num_prot_0)

            if stat_parity < self.rho:
                i += 1
                fair_trees.append(new_tree)
                predictions.append(new_prediction)

        self.ridge_model.fit(np.transpose(np.asarray(predictions)), y)
        self.decision_trees = fair_trees
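A small standalone sketch of the statistical-parity check used in the loop above, with toy arrays; `prot` marks the protected group and a prediction of 0 is treated as the outcome being compared.

import numpy as np

pred = np.array([0, 1, 0, 0, 1, 0])
prot = np.array([1, 1, 1, 0, 0, 0])
rate_1 = np.sum((pred == 0) & (prot == 1)) / np.sum(prot == 1)
rate_0 = np.sum((pred == 0) & (prot == 0)) / np.sum(prot == 0)
stat_parity = abs(rate_1 - rate_0)
print(stat_parity)  # 0 means both groups receive the outcome at the same rate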
Example #16
def read_results(data, model_name):
    with open('data.json') as data_json:
        data_params = json.load(data_json)

    # Prepare data
    data_path = os.path.join(DATA_PATH, data_params['data'][data]['file_name'])
    print('Read file: {}'.format(data_path))
    X, y = load_csv(data_path)

    # Apply scaling
    scaler = MinMaxScaler().fit(X)
    X = scaler.transform(X)

    n_test = data_params['data'][data]['n_test']
    random_state = RANDOM_STATE
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=random_state)

    model = ExtraTreeClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    acc_train = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)
    print(('Train Acc: {:.4f}, ' + 'Test Acc: {:.4f}').format(
        acc_train, acc_test))

    df = pd.DataFrame(columns=COLUMNS)
    for attack in ATTACKS_NUM:
        for defence in DEFENCES_NUM:
            try:
                df = get_dataframe_sklearn(df, model, data, model_name, attack,
                                           defence)
            except FileNotFoundError as err:
                print(err)
                continue

    # These attacks have no hyperparameter
    df.loc[(df['Attack'] == 'boundary') | (df['Attack'] == 'tree'),
           'Adv_param'] = np.nan

    output_file = os.path.join(
        OUTPUT_PATH, '{}_{}_{}.csv'.format(data, model_name, VERSION))
    df.to_csv(output_file)
    print('Save to:', output_file)
Example #17
def train_extratree_model():
    results_extratree_model = {}
    results_extratree_model['acc'] = []
    results_extratree_model['p_r_f1_s'] = []
    for i in range(30):
        train_features, train_labels = get_train_data()
        test_features, test_labels = get_test_data()

        clf = ExtraTreeClassifier()
        clf.fit(train_features, train_labels)
        predictions = clf.predict(test_features)
        p_r_f1_s = precision_recall_fscore_support(test_labels, predictions)
        acc = accuracy_score(test_labels, predictions)
        print("ExtraTree Model Classifier : ", acc)
        print(
            "ExtraTree Model Classifier Precision, Recall, F1-Score, Support: ",
            p_r_f1_s)
        results_extratree_model['acc'].append(acc)
        results_extratree_model['p_r_f1_s'].append(p_r_f1_s)
        time.sleep(10)
    pickle.dump(results_extratree_model,
                open('results_extratree_model.pkl', 'wb'))
class ExtraTreeClassifier(Classifier):
    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._etc = ETC()

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._etc = self._etc.fit(matrix, classes)
            print('Fitting complete...')
            self._has_fit = True
        output = self._etc.predict(
            self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
class ExtraTreeClassifier(Classifier):
	
	def __init__(self, matrixdatabase):
		self._matrix_database = matrixdatabase
		self._has_fit = False
		self._etc = ETC()

	def learn(self, ingredients, cuisine):
		return

	def classify(self, ingredients):
		if not self._has_fit:
			matrix, classes = self._matrix_database.make_train_matrix()
			self._etc = self._etc.fit(matrix, classes)
			print 'Fitting complete...'
			self._has_fit = True
		output = self._etc.predict(self._matrix_database.make_row_from_recipe(ingredients))
		return output[0]
Example #20
"去除低方差特征,统计学方法"
# np.random.seed(10)
# arr = np.random.random((5, 6))
# var = 0.4
# arrnew = VarianceThreshold(0.4 * var * (1 - var)).fit(arr).transform(arr)
# print(arr)
# print(arrnew)

"单变量特征选择,统计学方法"
X, y = load_iris(return_X_y=True)
print(X.shape, y.shape)

"SelectFromModel 只包含两种方法"
clf = ExtraTreeClassifier()
clf.fit(X, y)
print(clf.feature_importances_)

model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print(X_new.shape)
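As an optional follow-up (not in the original snippet), `get_support` on the fitted selector shows which of the four iris columns passed the importance threshold.

print(model.get_support(indices=True))  # indices of the columns kept by SelectFromModel
print([n for n, keep in zip(load_iris().feature_names, model.get_support()) if keep])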


# "基于原始数据"
# DTC = RandomForestClassifier(n_estimators=20)
# DTC.fit(X, y)
# score1 = cross_val_score(DTC, X, y, cv=5)
# print(np.mean(score1))
#
# "单变量特征选择,统计学方法"
# Xnew = SelectKBest(chi2, k=2).fit_transform(X, y)  # 保留特征的个数
Example #21
#zero variance removal

#from sklearn.feature_selection import VarianceThreshold
#var=VarianceThreshold()
#
#train_X=var.fit_transform(train_X)

#train_X[train_X._get_numeric_data().columns]=var.fit_transform(train_X._get_numeric_data())

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
clf=ExtraTreeClassifier(criterion="entropy",random_state=100)
#clf=RandomForestClassifier(n_estimators=100,criterion="entropy",random_state=100)
clf.fit(train_X,train_y)
print(clf.feature_importances_)
    
for name, importance in zip(train_X.columns, clf.feature_importances_):
    print(name, "=", importance)
    
imp_datafram=pd.DataFrame(list(zip(train_X.columns, clf.feature_importances_)))

#import matplotlib.pyplot as plt
#
#y_pos = np.arange(len(imp_datafram[0]))
#performance = imp_datafram[1]
# 
#plt.bar(y_pos, performance, align='center', alpha=0.5)
#plt.xticks(y_pos, imp_datafram[0])
Example #22
#    data = ''
    with open(fname) as f:
        for s in f:
            tmp = map(int, s.split())
            labels.append(tmp[-1])
            res.append(tmp[:-1])
#            data += (str(tmp)[1:-1]).replace(',', '')+'\n'
#    with open('out.txt', 'w') as o:
#        o.write(str(data)[1:-1])
    return res, labels

X, Y = readData('german.data-numeric.txt')
Xt = X[:-200] ; Yt = Y[:-200]
XT = X[-200:] ; YT = Y[-200:]
print len(Xt)
clf = ExtraTreeClassifier(max_depth=None, random_state=0)
clf = clf.fit(Xt, Yt)

#proba = clf.predict_proba(XT)
#print len(proba)
#print proba

err = 0
for i, x in enumerate(XT):
    if clf.predict(x) != YT[i]: 
        prob = clf.predict_proba(x)
#        print prob
        err += 1

print err
def myclassify(numfiers=5,xtrain=xtrain,ytrain=ytrain,xtest=xtest,ytest=ytest):
    count = 0



    bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
    bagging2.fit(xtrain,ytrain)
    #print bagging2.score(xtest,ytest)
    count += 1
    classifiers = [bagging2.score(xtest,ytest)]

    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        #print tree2.fit(xtrain,ytrain)
        #print tree2.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree2.score(xtest,ytest))
        print "1"
        print tree2.score(xtest,ytest)

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging1.score(xtest,ytest))
        print "2"
        print bagging1.score(xtest,ytest)

#     if count < numfiers:
#         # votingClassifiers combine completely different machine learning classifiers and use a majority vote
#         clff1 = SVC()
#         clff2 = RFC(bootstrap=False)
#         clff3 = ETC()
#         clff4 = neighbors.KNeighborsClassifier()
#         clff5 = quadda()
#         print"3"


#         eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
#         eclf = eclf.fit(xtrain,ytrain)
#         #print(eclf.score(xtest,ytest))
#         # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
#         #     cla
#         #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
#         #     print ()
#         count+=1
#         classifiers = np.append(classifiers,eclf.score(xtest,ytest))


#     if count < numfiers:
#         svc1 = SVC()
#         svc1.fit(xtrain,ytrain)
#         dec = svc1.score(xtest,ytest)
#         count+=1
#         classifiers = np.append(classifiers,svc1.score(xtest,ytest))
#         print "3"

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,qda.score(xtest,ytest))
        print "4"


    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        #print tree1.fit(xtrain,ytrain)
        #print tree1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree1.score(xtest,ytest))

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain,ytrain)
        #print(knn1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn1.score(xtest,ytest))

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        #print(lda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,lda.score(xtest,ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        #print tree3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree3.score(xtest,ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        #print bagging3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging3.score(xtest,ytest))


    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        #print bagging4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging4.score(xtest,ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        #print tree4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree4.score(xtest,ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        #print(tree6.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree6.score(xtest,ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        #print(knn2.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn2.score(xtest,ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        #print(knn3.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn3.score(xtest,ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        #print(knn4.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn4.score(xtest,ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        #print(knn5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn5.score(xtest,ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        #print (ncc1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,ncc1.score(xtest,ytest))

    if count < numfiers:
    # Nearest shrunken Centroid
        for shrinkage in [None,0.05,0.1,0.2,0.3,0.4,0.5]:
            ncc2 = NearestCentroid(shrink_threshold = shrinkage)
            ncc2.fit(xtrain,ytrain)
            #print(ncc2.score(xtest,ytest))

        count+=1
        classifiers = np.append(classifiers,ncc2.score(xtest,ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        #print(tree5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree5.score(xtest,ytest))

    classifierlabel = ["BaggingETC (with bootstraps set to false)","ETC","BaggingETC","Voting Classifier","svm","QDA","DTC","KNN (default)","LDA","RFC",
                       "BaggingRFC (with bootstraps set to false)","BaggingSVC (with bootstraps set to false)","RFC (bootstrap false)","GBC",
                        "knn (n_neighbors = 10)","knn (n_neighbors = 3)","knn (ball tree algorithm)","knn (kd_tree algorithm)",
                       "Nearest Centroid","Shrunken Centroid?","ABC"]


    classifierlabel = classifierlabel[:len(classifiers)]
    #print len(classifiers)
    #print classifiers
    for i in range(len(classifiers)):


        print ("{} classifier has percent correct {}".format(classifierlabel[i],classifiers[i]))
Example #24
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# In[ ]:

DTree = DecisionTreeClassifier(max_depth=3)
DTree.fit(x_train, y_train)
yhat = DTree.predict(x_test)
print("DecisionTreeClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, DTree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# In[ ]:

ETree = ExtraTreeClassifier(max_depth=3)
ETree.fit(x_train, y_train)
yhat = ETree.predict(x_test)
print("ExtraTreeClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, ETree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# In[ ]:

Ada = AdaBoostClassifier()
Ada.fit(x_train, y_train)
yhat = Ada.predict(x_test)
print("AdaBoostClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, Ada.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
Example #25
def main():
    # filepath: sentence data file path
    # vecfile: word vector file path pre-generated from other
    # vectype: compression methods. Average, avg+tf-idf one line, agg+tf-idf whole data
    # vec_path: vector file save path

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/stem_testdata'  # 'data/data_test'
    vecfile = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt'

    vec_files = [
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.100d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.200d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.42B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.840B.300d.txt'
    ]
    # not sure why yet, but relative file paths were hitting permission errors,
    # so we're using absolute paths for now
    vec_path = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/word_vector/'

    # Here, we can choose type of vectorization
    # there are 6 word vector file downloaded from glove
    """
    vectype = 1
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path+name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 2
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_OnelineTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 3
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_WholeDataTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))
    """

    # from here on, this will be erased.

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    #filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/hyp1-hyp2-ref'
    vectype = 1
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_diffOrder'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 2
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_OnelineTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 3
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_WholeDataTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    vec_path = 'data/word_vector/glove.6B.50d_vec_diffOrder'
    wvec = load_wordvec(vec_path)
    target_path = 'data/dev.answers'
    answer = load_target(target_path)

    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import NuSVC
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    clf1 = KNeighborsClassifier()
    clf2 = DecisionTreeClassifier()
    clf3 = ExtraTreeClassifier()
    clf4 = MLPClassifier()
    clf5nu = NuSVC()
    clf6lin = LinearSVC()
    # 'sag', 'saga' and 'lbfgs'

    print("Training Starts")
    X_train, X_test, y_train, y_test = train_test_split(wvec,
                                                        answer,
                                                        test_size=0.10,
                                                        random_state=42)
    #clf1.fit(X_train, y_train)
    clf1.fit(X_train, y_train)
    print('KNeighborsClassifier score 50d', clf1.score(X_test, y_test))
    clf2.fit(X_train, y_train)
    print('DecisionTreeClassifier score 50d', clf2.score(X_test, y_test))
    clf3.fit(X_train, y_train)
    print('ExtraTreeClassifier score 50d', clf3.score(X_test, y_test))
    clf4.fit(X_train, y_train)
    print('MLPClassifier score 50d', clf4.score(X_test, y_test))

    clf1 = OneVsRestClassifier(KNeighborsClassifier())
    clf2 = OneVsRestClassifier(DecisionTreeClassifier())
    clf3 = OneVsRestClassifier(ExtraTreeClassifier())
    clf4 = OneVsRestClassifier(MLPClassifier())
    clf5 = OneVsOneClassifier(NuSVC())
    clf6 = OneVsRestClassifier(LinearSVC())

    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    clf7 = OneVsRestClassifier(SGDClassifier())
    clf8 = OneVsRestClassifier(Perceptron())
    clf9 = OneVsRestClassifier(PassiveAggressiveClassifier())

    print('One vs Rest methods case::')
    print('KNeighborsClassifier score 50d',
          clf1.fit(X_train, y_train).score(X_test, y_test))
    print('DecisionTreeClassifier score 50d',
          clf2.fit(X_train, y_train).score(X_test, y_test))
    print('ExtraTreeClassifier score 50d',
          clf3.fit(X_train, y_train).score(X_test, y_test))
    print('MLPClassifier score 50d',
          clf4.fit(X_train, y_train).score(X_test, y_test))

    print('SGDClassifier score 50d',
          clf7.fit(X_train, y_train).score(X_test, y_test))
    print('Perceptron score 50d',
          clf8.fit(X_train, y_train).score(X_test, y_test))
    print('PassiveAggressiveClassifier score 50d',
          clf9.fit(X_train, y_train).score(X_test, y_test))

    print('NuSVC score 50d', clf5.fit(X_train, y_train).score(X_test, y_test))
    print('LinearSVC score 50d',
          clf6.fit(X_train, y_train).score(X_test, y_test))

    clf5nu.fit(X_train, y_train)
    print('NuSVC score 50d', clf5nu.score(X_test, y_test))
    clf6lin.fit(X_train, y_train)
    print('LinearSVC score 50d', clf6lin.score(X_test, y_test))

    from sklearn.datasets import make_friedman1
    from sklearn.feature_selection import RFECV
    from sklearn.neighbors import KNeighborsClassifier
    estimator = DecisionTreeClassifier()
test_set = df.iloc[train_data_len:, :]

#print(train_set.head(5))
train_x = train_set.iloc[:, 0:6]
train_y = train_set.iloc[:, 6:]

#print(type(train_y))

#train_y.reshape(len(train_y), )

#print(train_y.head(5))
test_x = test_set.iloc[:, 0:6]
test_y = test_set.iloc[:, 6:]

#test_y.reshape(len(test_y), )

#print(train_x.head(5))
#print(train_y.head(5))

from sklearn.tree import ExtraTreeClassifier
classifier = ExtraTreeClassifier(random_state=0,
                                 criterion="entropy",
                                 splitter="best")

classifier.fit(train_x, train_y.values.ravel())

info = classifier.score(test_x, test_y.values.ravel())

print(info)
#model = Sequential()
Example #27
f1_score(y_test, smote_pred)
   
recall_score(y_test, smote_pred)


###################################################################
########################## Feature Selection ######################
###################################################################


#Feature Selection using Tree Classifier
a = r2.iloc[:,0:19]  #independent columns
b = r2.iloc[:,-1]    #target column

model = ExtraTreeClassifier()
model.fit(a,b)

print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers

#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=a.columns)
feat_importances.nlargest(19).plot(kind='barh')


###############################################################
####################### Cross Validation ######################
###############################################################


colnames = list(r2.columns)
predictors = colnames[:19]
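The cross-validation section is cut off at this point; the lines below are only a plausible sketch of what it might compute, reusing the `a` (features) and `b` (target) frames built above.

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(ExtraTreeClassifier(), a, b, cv=5)
print(cv_scores.mean(), cv_scores.std())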
def myclassify_AudPow(numfiers,xtrain_1,xtrain_2,ytrain_1,ytrain_2,xtest):

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest = xtest[~np.isnan(xtest).any(axis=1),:]
    xtest = xtest[~np.isinf(xtest).any(axis=1),:]

    xtrain = np.append(xtrain_1,xtrain_2,0)
    ytrain = np.append(ytrain_1,ytrain_2)
    ytrain = np.ravel(ytrain)
    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')
    xtrunclength = xtrunclength['xtrunclength'][0]



    # if xtest is an NxM matrix, returns an N x numfiers matrix where each column corresponds to a classifier's prediction vector
    count = 0
    # print numfiers

    predictionMat = np.empty((xtest.shape[0],numfiers))
    predictionStringMat = []
    finalPredMat = []

    bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
    bagging2.fit(xtrain,ytrain)
    #print bagging2.score(xtest,ytest)
    ytest = bagging2.predict(xtest)
    predictionMat[:,count] = ytest
    count += 1


    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # votingClassifiers combine completely different machine learning classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()



        eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain,ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     cla
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain,ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain,ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        modeCol = predWindowVecModeFinder(tempCol,xtrunclength)
        modeStr = predVec2Str(modeCol)
        predictionStringMat.append(modeStr)
        finalPredMat += map(int,modeCol)

    return predictionStringMat,finalPredMat
Example #29
                filled = True, rounded = True,
                special_characters = True)
print(check_output('dot -Tpdf cart.dot -o cart.pdf', shell = True))
print("Accuracy = %s"%accuracy_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("Precision = %s"%precision_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("Recall = %s"%recall_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("F = %s"%fbeta_score(rnd_test_y, clf_cart.predict(rnd_test_X), beta=1))
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_cart.predict(rnd_test_X)))
roc_auc_scorer = get_scorer("roc_auc")
print("ROC AUC = %s"%roc_auc_scorer(clf_cart, rnd_test_X, rnd_test_y))
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_cart.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = 'CART-2')

## randomized tree with default setting
clf_rnd_tree = ExtraTreeClassifier()
clf_rnd_tree.fit(rnd_training_X, rnd_training_y)
export_graphviz(clf_rnd_tree, out_file = 'default_rnd_tree.dot',
                feature_names = attribute_names,
                class_names = bi_class_target_attrs,
                filled = True, rounded = True,
                special_characters = True)
print(check_output('dot -Tpdf default_rnd_tree.dot -o default_rnd_tree.pdf', shell = True))
print("Accuracy = %s"%accuracy_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Precision = %s"%precision_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Recall = %s"%recall_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("F = %s"%fbeta_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X), beta=1))
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_rnd_tree.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = "Randomized tree-1")
axes_roc.set_title("ROC of CART and a randomized tree")
axes_roc.set_xlabel("FPR")
        # separate the data from the target attributes
        test_data = dataset2.drop('change_id', axis=1)
        test_data = test_data.drop('411_commit_time', axis=1)
        test_data = test_data.drop('412_full_path', axis=1)

        # remove unnecessary features
        #test_data = test_data.drop('File', axis=1)

        # the lables of test data
        test_target = dataset2.Buggy

        #print(test_target)

        from imblearn.over_sampling import RandomOverSampler
        ros = RandomOverSampler(random_state=0)
        X_resampled, y_resampled = ros.fit_resample(train_data, train_target)
        test_data_resampled, test_target_resampled = ros.fit_resample(
            test_data, test_target)

        clf = ExtraTreeClassifier(splitter='best')

        test_pred = clf.fit(X_resampled,
                            y_resampled).predict(test_data_resampled)

        file.write(
            classification_report(test_target_resampled,
                                  test_pred,
                                  labels=[0, 1]))
        file.write("\n")
file.close()
def build_separate_tree(X,y,max_features,max_depth,min_samples_split):
	clf = ExtraTreeClassifier(max_features=max_features,max_depth=max_depth,min_samples_split=min_samples_split)
	clf = clf.fit(X,y)
	return clf
Example #32
mix_estimators = [
    ('le', LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='multinomial')),
    ('te', rn.choice(tree_estimators)[1]), *rn.sample(boost_estimators, 4), *rn.sample(nb_estimators, 2)]
voting_classifier_mix = VotingClassifier(estimators=mix_estimators)

all_estimators = [('lgr',  LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='multinomial')),
                  *tree_estimators, *boost_estimators, *nb_estimators]
voting_classifier_all = VotingClassifier(estimators=all_estimators)


# Train all models, (it will take some time)
logistic_regression.fit(x_train, y_train)

tree_classifier.fit(x_train, y_train)
extra_tree_classifier.fit(x_train, y_train)

adaboost_classifier.fit(x_train, y_train)
extra_trees_classifier.fit(x_train, y_train)
bagging_classifier.fit(x_train, y_train)
gradient_boost_classifier.fit(x_train, y_train)

mlp_classifier.fit(x_train, y_train)
gaussian_nb.fit(x_train, y_train)

voting_classifier_tree.fit(x_train, y_train)
voting_classifier_boost.fit(x_train, y_train)
voting_classifier_nb.fit(x_train, y_train)
voting_classifier_mix.fit(x_train, y_train)
voting_classifier_all.fit(x_train, y_train)
Example #33
#data = np.array(data)

print('Finish Label Encode')

clf = ExtraTreeClassifier(random_state=103, splitter='random', max_features=9)

##Get dataset
X = np.array(traindata.iloc[:, :10])
y = np.array(traindata.iloc[:, 10])

##build decision tree
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=11)
clf.fit(X_train, y_train)

print('Finish Extra tree training')

predicttest = clf.predict(X_test)

##count click (0 or 1)
countClick = [0, 0]
for i in predicttest:
    if i == 0:
        countClick[0] += 1
    else:
        countClick[1] += 1
print(countClick)

##get accuracy, precision, recall, f_measure
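The example is truncated here; the lines below are only a minimal sketch of the metrics announced in the comment above, assuming `y_test` and `predicttest` from the split above and the standard scikit-learn metric functions.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(accuracy_score(y_test, predicttest))
print(precision_score(y_test, predicttest))
print(recall_score(y_test, predicttest))
print(f1_score(y_test, predicttest))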
Example #34
dfscore.plot(kind='barh')   

# Label Encoding
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y.iloc[:]=le.fit_transform(y.iloc[:])

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x.iloc[:,:]=sc.fit_transform(x.iloc[:,:])

# Feature Importance
from sklearn.tree import ExtraTreeClassifier
classifier=ExtraTreeClassifier()
classifier.fit(x,y)
importance=pd.Series(classifier.feature_importances_,index=x.columns)
importance.plot(kind='barh')

# Segregating training & testing data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

# Modelling
# Bagging with decision trees:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
gb=BaggingClassifier(DecisionTreeClassifier(),n_estimators=20,max_samples=0.5,max_features=1)
gb.fit(x_train,y_train)
gb.score(x_train,y_train)
gb.score(x_test,y_test)
Example #35
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plotting feature importance
plt.figure(figsize=(10,5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="g", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), features,rotation=90)
plt.xlim([-1, X.shape[1]])
plt.show()    
# Importing, initializing and fitting the extra tree classifier
from sklearn.tree import ExtraTreeClassifier
extree = ExtraTreeClassifier(max_features=11,min_samples_split=21,
                             random_state=101,max_depth =28)
extree.fit(X_train_sm1,y_train_sm1)
extree_predict=extree.predict(X_test)
#checking performance of the extra tree classifier
print(confusion_matrix(y_test,extree_predict))
print(classification_report(y_test,extree_predict))
#Importing test data
test=pd.read_csv('FIA_predictions.csv')
# getting columns same as training data 
test=test.iloc[:,0:33]
#converting data type for categorical variables
test['NAICS2']=test['NAICS2'].astype('category')
test['NAICS4']=test['NAICS4'].astype('category')
test['NAICS_CD']=test['NAICS_CD'].astype('category')
test['Restricted_Vertical']=test['Restricted_Vertical'].astype('category')
test['LCTN_TYP_VAL']=test['LCTN_TYP_VAL'].astype('category')
test['srvc_five_dgt_zip']=test['srvc_five_dgt_zip'].astype('category')
 def extra_tree(self):
     x_train, x_test, y_train, y_test = self.preprocessing()
     extra_tree_model = ExtraTreeClassifier()
     y_pred = extra_tree_model.fit(x_train, y_train).predict(x_test)
     self.printing(y_test, y_pred, 'Extra Tree')
### TREESSSSS
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTC
tree1 = DTC()
print tree1
tree1.fit(xtrain,ytrain1)
print tree1.fit(xtrain,ytrain1)
print tree1.score(xtest,ytest1)


# In[22]:

from sklearn.tree import ExtraTreeClassifier as ETC
tree2 = ETC()
print tree2
tree2.fit(xtrain,ytrain1)
print tree2.fit(xtrain,ytrain1)
print tree2.score(xtest,ytest1)


# In[23]:

from sklearn.ensemble import BaggingClassifier
bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain,ytrain1)
print bagging1.score(xtest,ytest1)


# In[24]:

from sklearn.ensemble import BaggingClassifier
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    #NOTE we might not need xtltrain
    # xtrain and ytrain are your training set. xtltrain is the indices of corresponding recordings in xtrain and ytrain. these will always be present
    #xtest is your testing set. xtltest is the corresponding indices of the recording. for the practice set xtltest = xtrunclength
    # ytest is optional and depends on if you are using a testing set or the practice set

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest,xtltest,ytarget = removeNanAndInf(xtest,xtltest,ytarget)
    # print 'finished removal of Nans'

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)


    # if xtest is an NxM matrix, returns an N x numfiers matrix where each column corresponds to a classifier's prediction vector
    count = 0
    # print numfiers

    predictionMat = np.empty((xtest.shape[0],numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain,ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count+=1
    if count < numfiers:
        # votingClassifiers combine completely different machine learning classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()



        eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain,ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     cla
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:

        bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
        bagging2.fit(xtrain,ytrain)
        #print bagging2.score(xtest,ytest)
        ytest = bagging2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1


    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain,ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain,ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    # print xtltest
    # print len(ytest)
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol,xtltest,4,grids,isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol,xtltest,4,isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget,xtltest,1,isPrint=0)
        if testing:
             modeStr = temppredVec2Str(modeCol,grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += map(int,modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)
        if testing == False:
            if ytarget is not None:
                #print targets1
                #print ""
                #print predictions1
                confusionme = confusion_matrix(targets1[0],predictions1[0])
                #print "Confusion Matrix is: "
                #print confusionme


    return predictionStringMat, targetStringMat, finalPredMat
    print("CV error = %f +-%f" % (np.mean(scores), np.std(scores)))
    #
    print "Cross validation"
    scores = cross_val_score(RandomForestClassifier(), training, classes,
                             cv=KFold(n=len(training), n_folds=5, random_state=42),
                             scoring="accuracy")
    print("CV error = %f +-%f" % (1. - np.mean(scores), np.std(scores)))
    print("Accuracy =", accuracy_score(y_test, tlf.predict(X_test)))
    print("Precision =", precision_score(y_test, tlf.predict(X_test)))
    print("Recall =", recall_score(y_test, tlf.predict(X_test)))
    print("F =", fbeta_score(y_test, tlf.predict(X_test), beta=1))
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

    print "Extra Tree classifier"
    rlf = ExtraTreeClassifier()
    rlf.fit(training, classes)

    print("Training error =", zero_one_loss(classes, rlf.predict(training)))

    X_train, X_test, y_train, y_test = train_test_split(training, classes)
    rlf = ExtraTreeClassifier()
    rlf.fit(X_train, y_train)
    print("Training error =", zero_one_loss(y_train, rlf.predict(X_train)))
    print("Test error =", zero_one_loss(y_test, rlf.predict(X_test)))

    scores = []
    print "K-fold cross validation"
    for train, test in KFold(n=len(training), n_folds=5, random_state=42):
        X_train, y_train = training[train], classes[train]
        X_test, y_test = training[test], classes[test]
        rlf = ExtraTreeClassifier().fit(X_train, y_train)