def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ("Underlying estimator 'knn' does not support sample weights.")
    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2])
    assert_true('lr' in eclf1.named_estimators)
    assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1])
    assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr'])
    eclf1.fit(X, y)
    assert_true('lr' in eclf1.named_estimators_)
    assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0])
    assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr'])

    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert_false(hasattr(eclf2, 'nb'))

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
def acc_VotingClassifier():
    kf = KFold(n_splits=10, shuffle=True)
    acc = 0.0
    temp = 1
    conf_mat = [[0 for i in range(10)] for j in range(10)]
    clf1 = GaussianNB()
    clf2 = RandomForestClassifier(n_estimators=20, max_features=None,
                                  class_weight="balanced_subsample")
    clf3 = SVC(kernel='rbf', probability=False)
    clf4 = LogisticRegression()
    eclf = VotingClassifier(estimators=[('gnb', clf1), ('rf', clf2), ('lr', clf4)],
                            voting='hard', weights=[1, 3, 3])
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        eclf = eclf.fit(X_train, y_train)
        y_predict = eclf.predict(X_test)
        acc_loop = getAccuracy(y_predict, y_test)
        conf_mat = buildConfusionMatrix(conf_mat, y_predict, y_test)
        print("*** Accuracy *** for " + str(temp) + "th time: " + str(acc_loop))
        acc += acc_loop
        temp += 1
    # Check whether the data set was transformed into MFCC (13), FFT (1000)
    # or KPCA (100) features, and predict on the matching validation set
    valid_mfcc = valid_fft = valid_kpca = None
    if X.shape[1] == 13:
        print('In 13 features if')
        valid_mfcc = eclf.predict(validation_set_mfcc)
    elif X.shape[1] == 1000:
        print('In 1000 features elif')
        valid_fft = eclf.predict(validation_set_fft)
    elif X.shape[1] == 100:
        print('In KPCA features elif')
        valid_kpca = eclf.predict(validation_set_kpca)
    acc = acc / 10.0
    printConfusionMatrix(conf_mat)
    return acc, getAccuracyFromConfusion(conf_mat), valid_mfcc, valid_fft, valid_kpca
def classify():
    train_X,Y = load_svmlight_file('data/train_last')
    test_X,test_Y = load_svmlight_file('data/test_last')
    train_X = train_X.toarray()
    test_X = test_X.toarray()
    Y = [int(y) for y in Y]
    # print 'Y:',len(Y)
    rows = pd.read_csv('data/log_test2.csv',index_col=0).sort_index().index.unique()
    train_n = train_X.shape[0]
    m = train_X.shape[1]
    test_n = test_X.shape[0]
    print(train_n, m)  # test_n
    # First, train all the classifiers on the training set
    print('train classify...')
    clf1 = LinearDiscriminantAnalysis()
    clf2 = GaussianNB()
    clf3 = LogisticRegression()
    clf4 = RandomForestClassifier()
    clf5 = KNeighborsClassifier(n_neighbors=12)
    clf6 = AdaBoostClassifier()
    # x_train,x_test,y_train,y_test = train_test_split(train_X,Y,test_size=0.2) # split the training set

    # print x_train.shape
    # print x_test.shape
    # clf.fit(train_X,Y)
    clf = VotingClassifier(estimators=[('la',clf1),('nb',clf2),('lr',clf3),('rf',clf4),('nn',clf5),('ac',clf6)], voting='soft', weights=[1.5,1,1,1,1,1])
    # clf1.fit(x_train,y_train)
    # clf2.fit(x_train,y_train)
    # clf3.fit(x_train,y_train)
    # clf4.fit(x_train,y_train)
    clf.fit(train_X,Y)
    print('end train classify')

    print('start classify....')
    # print metrics.classification_report(Y,predict_Y)
    # clf2.fit(train_X,Y)
    # print 'clf2 fited...'
    # clf3.fit(train_X,Y)
    # print 'clf3 fited...'
    # clf4.fit(train_X,Y)
    # print 'clf4 fited...'
    # clf1.fit(train_X,Y)
    # print 'clf1 fited...'
    # First classification result
    predict_Y = clf.predict(train_X)
    # predict_Y = clf.predict(train_X)
    print('classify result:')
    print(metrics.classification_report(Y, predict_Y))

    predict_Y = clf.predict(test_X)
    # print predict_Y,len(predict_Y)
    print('end classify...')
    # predict_Y = clf.predict(X[cnt_train:]) # comment out while training; uncomment to produce test-set output, and comment out the print metric above
    # predict_Y = clf.predict(test_X) # comment out while training; uncomment to produce test-set output, and comment out the print metric above
    DataFrame(predict_Y,index=rows).to_csv('data/info_test2.csv', header=False)
def test_predict_for_hard_voting():
    # Test voting classifier with non-integer (float) prediction
    clf1 = FaultySVC(random_state=123)
    clf2 = GaussianNB()
    clf3 = SVC(probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[
        ('fsvc', clf1), ('gnb', clf2), ('svc', clf3)], weights=[1, 2, 3],
        voting='hard')

    eclf1.fit(X, y)
    eclf1.predict(X)
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(all([not isinstance(est, RandomForestClassifier) for est in
                     eclf2.estimators_]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(
        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.],
                                         [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_parallel_predict():
    """Check parallel backend of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1).fit(X, y)
    eclf2 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def voting_class(X,training_target,Y):
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier
    
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
    eclf.fit(X[:,0:6],training_target)
    proba = eclf.predict_proba(Y[:, 0:6])

    # predict() needs the samples to classify
    pred = eclf.predict(Y[:, 0:6])
    return pred, proba
def voting_fit(X, y, RESULT_TEST_PATH, RESULT_PATH):
    ada_best = fit_adaboost(X, y)
    extratree_best = fit_extratree(X, y)
    rf_best = fit_rf(X, y)
    gbdt_best = fit_xgboost(X, y)
    svc_best = fit_svc(X, y)
    lr_best = fit_lr(X, y)

    votingC = VotingClassifier(estimators=[('rfc', rf_best), ('extc', extratree_best),('lr',lr_best),
                                            ('adac', ada_best), ('gbc', gbdt_best)], voting='soft',
                               n_jobs=4)
    votingC.fit(X, y)

    test_df = pd.read_csv(RESULT_TEST_PATH)
    test = np.array(test_df)

    #test_Survived = pd.Series(votingC.predict(test), name="Survived")

    result = votingC.predict(test)
    test_df.insert(test_df.columns.size, 'Survived', result)

    test_df = test_df[['PassengerId', 'Survived']]
    test_df['PassengerId'] = test_df['PassengerId'].apply(np.int64)
    test_df.to_csv(RESULT_PATH, index=False)
    print("finish!")
    def predict(self, X_test):
        '''
        Predict the class for each sample.
        '''
        if self.use_append:
            self.__X_test = X_test
        else:
            temp = []

        # first stage
        for clf in self.stage_one_clfs:
            y_pred = clf[1].predict(X_test)
            y_pred = np.reshape(y_pred, (len(y_pred), 1))
            if self.use_append:
                self.__X_test = np.hstack((self.__X_test, y_pred))
            else:
                temp.append(y_pred)

        if not self.use_append:
            self.__X_test = np.array(temp).T[0]

        # second stage
        # note: the VotingClassifier must be fitted before predict() is called;
        # this assumes it was fitted elsewhere on the stacked training features.
        majority_voting = VotingClassifier(estimators=self.stage_two_clfs,
                                           voting="hard", weights=self.weights)
        y_out = majority_voting.predict(self.__X_test)
        return y_out
def main(directory, tools_directory, non_tools_dir):
    global path
    path = sys.path[0]
    start = time.time()
    if directory is None or not os.path.isdir(directory):
        print("Please input directory containing pdf publications to classify")
        sys.exit(1)
    x_train, y_train = fetch_from_file()
    x_test, test_files = get_test_set(directory)
    # Just for testing, update machine learning part later

    x_train, x_test = normalize_scale(x_train, x_test)
    classifier = VotingClassifier(
        [("first", classifier_list[0]), ("second", classifier_list[1]),
         ("third", classifier_list[2])]  # estimator names must be unique
    )
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    if os.path.isdir(tools_directory):
        shutil.rmtree(tools_directory)
    os.makedirs(tools_directory)

    if os.path.isdir(non_tools_dir):
        shutil.rmtree(non_tools_dir)
    os.makedirs(non_tools_dir)

    for num, pub in zip(y_pred, test_files):
        if num:
            shutil.copy2(directory + pub, tools_directory + pub)
        else:
            shutil.copy2(directory + pub, non_tools_dir + pub)

    print "Classification:    Seconds taken: " + str(time.time() - start)
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        eclf3.fit(X, y, sample_weight)

    # check that _parallel_fit_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')
    clf = ClassifierErrorFit()
    eclf = VotingClassifier(estimators=[('clf', clf)])
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        eclf.fit(X, y, sample_weight=sample_weight)
def main(path,filename):

	batchsT = ['histogramaByN','histogramaColor','patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5']
	batchsAux = ['histogramaByN','histogramaColor','patronesCirculaesByN_2_5','patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12']
	#batchs = ['patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_2_9']
	#batchs = ['patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_3_5']
	#for batch in batchsAux:


	#print batch
	batchs = batchsAux
	#batchs.remove(batch)
	X = []
	y = []
	load_batch(y,path,'clases',filename) 
	y = [j for i in y for j in i]
	for batch in batchs:
		load_batch(X,path,batch,filename)
	
	#X,y = load_images('/tmp/train/')
	est = [RandomForest(),Boosting()]
	for i in range(0, 15):
		est.append(Gradient(i))
	for i in range(0, 4):
		est.append(SVM(i))

	#scores = cross_validation.cross_val_score(clf, X, y, cv=5)
	#print scores
	clf = VotingClassifier(estimators=est)

	clf.fit(X,y)
	pickle.dump( clf, open( "clf_grande.p", "wb" ) )
	return
	X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, y, test_size=0.2,random_state=777)
	#print clf.sub_score(X_test,Y_test)
	print('start')
	conf_matrix = metrics.confusion_matrix(Y_test,clf.predict(X_test))
	print('confusion matrix')
	print(conf_matrix)
	return
	for name, estim in est:
		print(name)
		#estim.fit(X_train,Y_train)
		#print estim.score(X_test,Y_test)
		print(cross_validation.cross_val_score(estim, X, y, cv=5, n_jobs=-1))
	print('voter')
	print(cross_validation.cross_val_score(clf, X, y, cv=5, n_jobs=-1))
	return
	#clf.fit(X_train,Y_train)
	print(clf.score(X_test, Y_test))

	return
    def classifier(self, scoring, cv, eval_using):
        
        adaclf = AdaBoostClassifier(algorithm='SAMME')
        xtr = StandardScaler().fit_transform(self.xtr)
        xte = StandardScaler().fit_transform(self.xte)
        
        # iterate over each grid score for param tuner
        for score in scoring:
            
            print('Tuning parameters of initial classifiers...')
            passive_params = param_tuner(PassiveAggressiveClassifier(), 
                                         score=score, cv=cv, xtr=xtr, 
                                         ytr=self.ytr)
            passclf = PassiveAggressiveClassifier().set_params(**passive_params)  
            sgd_params = param_tuner(SGDClassifier(), score=score, cv=cv,
                                     xtr=xtr, ytr=self.ytr)
            sgdclf = SGDClassifier().set_params(**sgd_params)
            
            # can't use resampling/bagging with the passive aggressive classifier:
            # it will raise ValueError: The number of class labels must be > 1,
            # since resampling may result in training sets with only 1 class.
            
            print('\n'+'Tuning meta-classifiers with tuned classifier/s...') 
            bagsgd_params = param_tuner(BaggingClassifier(sgdclf), 
                                         score=score, cv=cv, xtr=xtr, 
                                         ytr=self.ytr)
            bg_sgdclf = BaggingClassifier(sgdclf).set_params(**bagsgd_params)
            
            adasgd_params = param_tuner(adaclf.set_params(base_estimator=sgdclf), 
                                        score =score, cv=cv, xtr=xtr, 
                                        ytr=self.ytr)
            ada_sgdclf = adaclf.set_params(**adasgd_params)
            
            print('Voting on meta-classifiers/classifiers then predicting...')
            vote = VotingClassifier(estimators=[('BagSGD', bg_sgdclf),
                                                ('adaboostSGD', ada_sgdclf),
                                                ('Passive', passclf)],
                                    voting='hard').fit(xtr, self.ytr)

            start = time.time()
            y_true, y_pred = self.yte, vote.predict(xte)
            print('\n' + '-'*5, 'FINAL PREDICTION RESULTS','-'*5 +'\n', 
                  '{0:.4f}'.format(time.time()-start)+'--prediction time(secs)')
                  
            clf_evaluation = report(*eval_using, y_true=y_true, y_pred=y_pred)
            for reports in clf_evaluation:
                print('---',reports)
                print(clf_evaluation[reports])
def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.25)


    #clf = neighbors.KNeighborsClassifier()
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())] )

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('Accuracy', confidence)
    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))

    return confidence
def combine_voting_NB_classifier(X_train, X_test, y_train, y_test,X_train_meta, X_test_meta, y_train_meta, y_test_meta):
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import NearestCentroid
    from sklearn.ensemble import VotingClassifier

    clf_1 = BernoulliNB(alpha = 0.10000000000000001).fit(X_train_meta, y_train_meta)
    from sklearn.svm import SVC
    clf_2 = SVC(C=100, gamma=0.1).fit(X_train_meta, y_train_meta)
    clf_3 = NearestCentroid().fit(X_train_meta, y_train_meta)

    eclf = VotingClassifier(estimators=[('nb1', clf_1),('nb2', clf_3)], voting='hard')

    eclf = eclf.fit(X_train_meta, y_train_meta)
    y_voting_predicted = eclf.predict(X_test_meta)

    np.savetxt('oto_wyniki.csv', y_voting_predicted, delimiter=',')
    print("\n Here is the classification report for Voting classifier:")
    print(metrics.classification_report(y_test_meta, y_voting_predicted))
def mutipleClf(label_clfset,data,features,votingType='soft',weight=[],testData=None,testFeatures=None):
    flag = False
    if weight == []:
        flag = True
    print("======================================\n")
    print("Start at: " + time.strftime("%H:%M:%S") + "\n")
    if votingType=='soft':  
        for label_clf in label_clfset:
            #use ten-fold score, set cv to 10
            scores = cross_validation.cross_val_score(label_clf[1], data, features, cv=10)
            if flag:
                weight.append(scores.mean())
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType, weights=weight)
        
    else:
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType)
    result=eclf.fit(data,features)
    accuracy=0.0
    if testData is not None:
        testResult=eclf.predict(testData)
        accuracy=getAccuracy(testResult,testFeatures)   
    print ("End at: "+time.strftime("%H:%M:%S")+"\n")
    print "======================================\n"
    return result,accuracy
svm = SVC(C=10, gamma=0.01, probability=True, random_state=3)
svm.fit(x_train, y_train)
svm_pred = svm.predict(x_test)

print(" [accuarcy]")
print("tree     : ", accuracy_score(y_test, dtree_pred))
print("random forest : ", accuracy_score(y_test, rf_pred))
print("knn      : ", accuracy_score(y_test, knn_pred))
print("svm      : ", accuracy_score(y_test, svm_pred))

# Hard voting
voting_clf = VotingClassifier(estimators=[('rforest', rf), ('knn', knn),
                                          ('svm', svm)],
                              weights=[1, 1, 2],
                              voting='hard').fit(x_train, y_train)
hard_voting_predicted = voting_clf.predict(x_test)
print(" ")
print(" [ensemble] ")
print(" hard voting accuracy : ", accuracy_score(y_test,
                                                 hard_voting_predicted))

# Soft voting
voting_clf = VotingClassifier(estimators=[('rforest', rf), ('knn', knn),
                                          ('svm', svm)],
                              weights=[1, 1, 2],
                              voting='soft').fit(x_train, y_train)
soft_voting_predicted = voting_clf.predict(x_test)
print(" soft voting accuracy : ", accuracy_score(y_test,
                                                 soft_voting_predicted))

# Visualize the accuracy comparison
        ("clf", best_classifier),
    ]
)

vot_clf = VotingClassifier(estimators=[("glove", glove_clf), ("linear", svm_clf)], voting="soft")

vot_clf.fit(train_data.Abstract, train_data.Stance)

#########################
#   Predict data        #
#########################


print ("Predicting labels")
print ("Time used: {}".format((time.time() - start_time) / 60.0))
predictions = vot_clf.predict(unlabelled_data.Abstract)
print(predictions)

#########################
#   Print distribution  #
#########################

against_c = 0
favor_c = 0
none_c = 0
for pred in predictions:
    if pred == "AGAINST":
        against_c += 1
    elif pred == "FAVOR":
        favor_c += 1
    else:
        none_c += 1
xg = XGBClassifier(n_estimators=60000, learning_rate=0.1, colsample_bytree=0.51007979, max_depth=7, min_child_weight=2)

adarf_sub = RandomForestClassifier(n_estimators=30000, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
ada_sub = AdaBoostClassifier(base_estimator=adarf_sub, n_estimators=13, learning_rate=0.8)

vote = VotingClassifier([('rf', rf), ('et', et), ('xg', xg), ('ada', ada_sub)], voting='hard', weights=[1,1,1,2])

start_time = time.time()

vote.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))

os.system('say "Master, your program has finished"')

# Predict data and write to file.
vote_predict = vote.predict(test_data)

f = open("Ensemble3.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(vote_predict)):
    f.write(str(x+1) + "," + str(int(vote_predict[x])) + "\n")
f.close()

os.system('say "Master, your file has been created."')
datetime.datetime.now()


#0.76014, CV: 0.712590355255
start_time = time.time()
rf = RandomForestClassifier(n_estimators=1000, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
rf_scores = cross_val_score(rf, dataX_scaled, dataY, cv=10, n_jobs=-1)
training_y = training_data['CategoryNumber']

# create a testing and validation set from the training_data
train_x, test_x, train_y, test_y = cross_validation.train_test_split(
    training_x, training_y, test_size=0.1)
       
estimators = 40       
clf1 = BaggingClassifier(n_estimators=estimators)   #30.5
clf2 = ExtraTreesClassifier(n_estimators=estimators) #30.1
clf3 = RandomForestClassifier(n_estimators=estimators) #31.1


#clf = clf3
clf = VotingClassifier(estimators=[('b', clf1), ('e', clf2), ('r', clf3)], voting='soft' )
clf.fit(train_x, train_y)
predicted = clf.predict(test_x)




np.savetxt("temp_predictions.txt", predicted )

print("Correct", sum(predicted == test_y))
print("Total", len(test_y))
print("Accuracy", sum(predicted == test_y) / len(test_y) * 100.)
print("Uniques", len(np.unique(predicted)))




# sanity check  
# Tf–idf term weighting
tfidf_trans = TfidfTransformer()
tfidf_train = tfidf_trans.fit_transform(dtm_train)
tfidf_test = tfidf_trans.transform(dtm_test)  # only transform the test set with the fitted weights


# Training classifiers
clf1 = RandomForestClassifier()
clf2 = AdaBoostClassifier()
clf3 = xgb.XGBClassifier()
clf4 = KNeighborsClassifier()
clf5 = DecisionTreeClassifier()

eclf = VotingClassifier(estimators=[('rf', clf1), ('ab', clf2), ('gb', clf3), ('ls', clf4), ('dt', clf5)], voting='soft', weights=[1, 0.5, 1.5, 1, 1])
eclf.fit(dtm_train, cuisine_label)
predict_result = eclf.predict(dtm_test)


testdf['cuisine'] = le.inverse_transform(predict_result)
predict_dict = dict(zip(testdf['id'], testdf['cuisine']) )
with open('predict_result_ensemble.csv', 'w') as csvfile:
    fieldnames = ['id', 'cuisine']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for key, value in predict_dict.items():
        writer.writerow({'id': key, 'cuisine': value})
print('finished')

#for clf, label in zip([clf1, clf2, clf3, eclf], ['Random Forest', 'Adaboost', 'Xgboost']):
#    scores = cross_validation.cross_val_score(clf, tfidf_train, cuisine_label, cv=2, scoring='accuracy')
print('Model Training completed !!')
print('\n')
printProgressBar(100, l, prefix='Progress:', suffix='Complete', length=50)

time.sleep(1)
print('\n')
print('Running Voting classifier ... Please Wait ...')
print('\n')
printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)

#voting classifier to determine the best classifier
eclf1 = VotingClassifier(estimators=[('lg', clf1), ('rf', clf2), ('gnb', clf3),
                                     ('knc', clf4), ('dtc', clf5)],
                         voting='soft')
eclf1 = eclf1.fit(counts_train, y_train)
pred = (eclf1.predict(counts_test))
printProgressBar(85, l, prefix='Progress:', suffix='Complete', length=50)

print('\n')
print('Classification completed !!')
print('\n')
printProgressBar(100, l, prefix='Progress:', suffix='Complete', length=50)

time.sleep(1)
print('\n')
print('Calculating Prediction Accuracy and writing output file ...')
print('\n')
printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)
#Saving all the predicted data in csv file name predictions.csv
L1 = le.inverse_transform(pred)
pred_df = pd.DataFrame(L1)
# make the prediction
y_pred_bag = bagging_clf.predict(X_test)

# calculate the accuracy
accuracy = accuracy_score(y_test, y_pred_bag)
print(accuracy)

# --------------
# import packages
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB

# code starts here
nv = GaussianNB()

# fit the classifier on X_train,y_train
nv.fit(X_train, y_train)

voting_clf_soft = VotingClassifier([('lr', lr), ('rf', rf), ('nv', nv)],
                                   voting='soft')
voting_clf_soft.fit(X_train, y_train)
# make the prediction
y_pred_soft = voting_clf_soft.predict(X_test)

# calculate the accuracy
accuracy_soft = accuracy_score(y_test, y_pred_soft)
print(accuracy_soft)

# code ends here
import numpy as np

clf = [KNeighborsClassifier(n_neighbors=i) for i in range(1, 11)]
Multi = GaussianNB()
errort = []
error = []
#print(clf[0:2])

for j in range(0, 10):
    print(j)
    enf = VotingClassifier([('nb', Multi), ('knn', clf[j])],
                           voting='soft',
                           weights=[1, 8])
    #enf= VotingClassifier([('%d'% j, c) for c in clf][0:j],voting='hard')
    enf.fit(data.trainvector, data.trainlabels)
    errort.append(zero_one_loss(data.testlabels, enf.predict(data.testvector)))
    error.append(zero_one_loss(data.trainlabels,
                               enf.predict(data.trainvector)))
print(error, errort)

with open(
        "/home/amrita95/Desktop/Machine learning with networks/assignments/ensemble2test.txt",
        'w') as file:
    for e in errort:
        file.write("%f\n" % e)

with open(
        "/home/amrita95/Desktop/Machine learning with networks/assignments/ensemble2train.txt",
        'w') as file:
    for e in error:
        file.write("%f\n" % e)
from sklearn.metrics import accuracy_score
accuracy_count = accuracy_score(y_test_count, predictions_count)
print('Count Vectorized Words Accuracy:', accuracy_count)

#Ensembling acc= 0.9569
clf1 = LogisticRegression(random_state=10)
clf2 = RandomForestClassifier(n_estimators=1000,
                              max_features=17,
                              criterion='entropy',
                              random_state=0)
clf3 = GradientBoostingClassifier(n_estimators=1000, random_state=10)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3)],
                        voting='hard')
#Ensembling predictions
eclf.fit(X_train_count, y_train_count)
predictions_count = eclf.predict(X_test_count)

# cross validation with kfold = 10
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=eclf,
                             X=X_train_count,
                             y=y_train_count,
                             cv=10)
print('Ensemble Mean Accuracy', accuracies.mean())

### Function to create confusion matrix ###
import itertools


def plot_confusion_matrix(cm,
                          classes,
voting.fit(X_train, y_train)

y_pred_proba = voting.predict_proba(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1], pos_label=1)
auc_score = str(round(auc(fpr, tpr), 5))
print('AUC score: {}'.format(auc_score))

plot_roc_curve(fpr, tpr)
'''

Hard voting classifier

'''

voting_model = VotingClassifier(voting='hard',
                                estimators=[
                                    ('xgb', xgb_model),
                                    ('logit', lr_model),
                                    ('svm', svc_model),
                                ])

voting_model.fit(X_train, y_train)

y_pred = voting_model.predict(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
auc_score = str(round(auc(fpr, tpr), 5))
print('AUC score: {}'.format(auc_score))
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
dataset=make_moons(n_samples=5000,noise=0.5)
X=dataset[0]
y=dataset[1]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
log_clf=LogisticRegression()
rnd_clf=RandomForestClassifier()
svm_clf=SVC(probability=True)
voting_clf=VotingClassifier(estimators=[('lr',log_clf),('rnd',rnd_clf),('svm',svm_clf)],voting='hard')
voting_clf.fit(X_train,y_train)

#%%
from sklearn.metrics import accuracy_score
y_pred=voting_clf.predict(X_test)
accuracy_score(y_test,y_pred)

#%%
for clf in [log_clf,rnd_clf,svm_clf]:
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

#%%
# The code below trains an ensemble of 500 decision tree classifiers, each
# trained on 100 training instances sampled with replacement from the dataset
# (this is an example of bagging; to try pasting instead, set bootstrap=False).
# The n_jobs parameter tells sklearn how many CPU cores to use for training
# and prediction (-1 means all available cores):
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
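# A minimal sketch of the ensemble the comment above describes (assuming the
# X_train/y_train split and accuracy_score import from earlier in this snippet):
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
print(accuracy_score(y_test, bag_clf.predict(X_test)))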
def optimise_train_model(X,
                         XX,
                         YY,
                         error_selector,
                         test_size=0.3,
                         print_conf_mx=True,
                         plot_final_conf_mx=True,
                         plot_all_conf_mx=True,
                         savefigs=True,
                         pickle_model=False):
    # Function splits the data into training and test sets, then tests the
    # performance of a range of models on the training data. The final model
    # selected is then evaluated using the test set. The function automatically
    # selects the model that performs best on the training sets. All
    # performance metrics are printed to ipython. The performance metric
    # to use to select the model is determined in the function call. Options
    # for error_selector are: 'accuracy', 'F1', 'recall', 'precision', and
    # 'average_all_metric'
    # The option 'plot_all_conf_mx' can be set to True or False. If True, the
    # train set confusion matrices will be plotted for all models. If False,
    # only the final model confusion matrix will be plotted.
    # X, XX, YY are the datasets with and without labels.

    # split data into test and train sets.
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        XX, YY, test_size=test_size)

    # test different classifiers and report performance metrics using training data only

    # 1. Try Naive Bayes
    clf_NB = GaussianNB()
    clf_NB.fit(X_train, Y_train)
    accuracy_NB = clf_NB.score(X_train, Y_train)  # calculate accuracy
    Y_predict_NB = clf_NB.predict(X_train)  # make a new prediction
    conf_mx_NB = confusion_matrix(Y_train,
                                  Y_predict_NB)  # calculate confusion matrix
    recall_NB = recall_score(Y_train, Y_predict_NB, average="weighted")
    f1_NB = f1_score(Y_train, Y_predict_NB,
                     average="weighted")  # calculate f1 score
    precision_NB = precision_score(Y_train, Y_predict_NB, average='weighted')
    average_metric_NB = (accuracy_NB + recall_NB + f1_NB) / 3

    # 2. Try K-nearest neighbours
    clf_KNN = neighbors.KNeighborsClassifier()
    clf_KNN.fit(X_train, Y_train)
    accuracy_KNN = clf_KNN.score(X_train, Y_train)
    Y_predict_KNN = clf_KNN.predict(X_train)
    conf_mx_KNN = confusion_matrix(Y_train, Y_predict_KNN)
    recall_KNN = recall_score(Y_train, Y_predict_KNN, average="weighted")
    f1_KNN = f1_score(Y_train, Y_predict_KNN, average="weighted")
    precision_KNN = precision_score(Y_train, Y_predict_KNN, average='weighted')
    average_metric_KNN = (accuracy_KNN + recall_KNN + f1_KNN) / 3

    # 3. Try support Vector Machine with best params calculated using
    # GridSearch cross validation optimisation
    tuned_parameters = [{
        'kernel': ['linear'],
        'gamma': [1e-1, 1e-2, 1e-3, 1e-4],
        'C': [0.1, 1, 10, 100, 1000, 10000]
    }, {
        'kernel': ['rbf'],
        'gamma': [1e-1, 1e-2, 1e-3, 1e-4],
        'C': [0.1, 1, 10, 100, 1000, 10000]
    }, {
        'kernel': ['poly'],
        'gamma': [1e-1, 1e-2, 1e-3, 1e-4],
        'C': [0.1, 1, 10, 100, 1000, 10000]
    }, {
        'kernel': ['sigmoid'],
        'gamma': [1e-1, 1e-2, 1e-3, 1e-4],
        'C': [0.1, 1, 10, 100, 1000, 10000]
    }]

    clf_svm = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=3)
    clf_svm.fit(X_train, Y_train)

    print()
    print("Best parameters set found on development set:")
    print(clf_svm.best_params_)
    print()  # line break

    kernel = clf_svm.best_estimator_.get_params()['kernel']
    C = clf_svm.best_estimator_.get_params()['C']
    gamma = clf_svm.best_estimator_.get_params()['gamma']

    clf_svm = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
    clf_svm.fit(X_train, Y_train)
    accuracy_svm = clf_svm.score(X_train, Y_train)
    Y_predict_svm = clf_svm.predict(X_train)
    conf_mx_svm = confusion_matrix(Y_train, Y_predict_svm)
    recall_svm = recall_score(Y_train, Y_predict_svm, average="weighted")
    f1_svm = f1_score(Y_train, Y_predict_svm, average="weighted")
    precision_svm = precision_score(Y_train, Y_predict_svm, average='weighted')
    average_metric_svm = (accuracy_svm + recall_svm + f1_svm) / 3

    # 4. Try  a random forest classifier
    clf_RF = RandomForestClassifier(n_estimators=1000,
                                    max_leaf_nodes=16,
                                    n_jobs=-1)
    clf_RF.fit(X_train, Y_train)
    accuracy_RF = clf_RF.score(X_train, Y_train)
    Y_predict_RF = clf_RF.predict(X_train)
    conf_mx_RF = confusion_matrix(Y_train, Y_predict_RF)
    recall_RF = recall_score(Y_train, Y_predict_RF, average="weighted")
    f1_RF = f1_score(Y_train, Y_predict_RF, average="weighted")
    precision_RF = precision_score(Y_train, Y_predict_RF, average='weighted')
    average_metric_RF = (accuracy_RF + recall_RF + f1_RF) / 3

    # 5. Try an ensemble of all the other classifiers (not RF) using the voting classifier method
    ensemble_clf = VotingClassifier(estimators=[('NB', clf_NB),
                                                ('KNN', clf_KNN),
                                                ('svm', clf_svm),
                                                ('RF', clf_RF)],
                                    voting='hard')
    ensemble_clf.fit(X_train, Y_train)
    accuracy_ensemble = ensemble_clf.score(X_train, Y_train)
    Y_predict_ensemble = ensemble_clf.predict(X_train)
    conf_mx_ensemble = confusion_matrix(Y_train, Y_predict_ensemble)
    recall_ensemble = recall_score(Y_train,
                                   Y_predict_ensemble,
                                   average="weighted")
    f1_ensemble = f1_score(Y_train, Y_predict_ensemble, average="weighted")
    precision_ensemble = precision_score(Y_train,
                                         Y_predict_ensemble,
                                         average='weighted')
    average_metric_ensemble = (accuracy_ensemble + recall_ensemble +
                               f1_ensemble) / 3

    print()
    print('*** MODEL TEST SUMMARY ***')
    print('KNN accuracy = ', accuracy_KNN, 'KNN_F1_Score = ', f1_KNN,
          'KNN Recall = ', recall_KNN, 'KNN precision = ', precision_KNN)
    print('Naive Bayes accuracy = ', accuracy_NB, 'Naive_Bayes_F1_Score = ',
          f1_NB, 'Naive Bayes Recall = ', recall_NB,
          'Naive Bayes Precision = ', precision_NB)
    print('SVM accuracy = ', accuracy_svm, 'SVM_F1_Score = ', f1_svm,
          'SVM recall = ', recall_svm, 'SVM Precision = ', precision_svm)
    print('Random Forest accuracy', accuracy_RF, 'Random Forest F1 Score = ',
          f1_RF, 'Random Forest Recall', recall_RF,
          'Random Forest Precision = ', precision_RF)
    print('Ensemble accuracy', accuracy_ensemble, 'Ensemble F1 Score = ',
          f1_ensemble, 'Ensemble Recall', recall_ensemble,
          'Ensemble Precision = ', precision_ensemble)

    # PLOT CONFUSION MATRICES

    if plot_all_conf_mx:

        fig = plt.figure(figsize=(15, 15))
        ax1 = fig.add_subplot(321)
        ax1.imshow(conf_mx_NB), plt.title(
            'NB Model Confusion Matrix'), plt.colorbar
        classes = clf_NB.classes_
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes, rotation=45)

        ax2 = fig.add_subplot(322)
        ax2.imshow(conf_mx_KNN, cmap=plt.cm.gray), plt.title(
            'KNN Model Confusion Matrix'), plt.colorbar,
        classes = clf_svm.classes_
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes, rotation=45)

        ax3 = fig.add_subplot(323)
        ax3.imshow(
            conf_mx_svm,
            cmap=plt.cm.gray), plt.title('SVM Confusion Matrix'), plt.colorbar,
        classes = clf_svm.classes_
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes, rotation=45)

        ax4 = fig.add_subplot(324)
        ax4.imshow(conf_mx_RF, cmap=plt.cm.gray), plt.title(
            'Random Forest Confusion Matrix'), plt.colorbar,
        classes = clf_svm.classes_
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes, rotation=45)

        ax5 = fig.add_subplot(325)
        ax5.imshow(conf_mx_ensemble, cmap=plt.cm.gray), plt.title(
            'voting Ensemble Confusion Matrix'), plt.colorbar,
        classes = clf_svm.classes_
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes, rotation=45)

        plt.tight_layout()

        if savefigs:
            plt.savefig(str(savefig_path + 'confusion_matrices.jpg'))

        plt.show()

    print()  # line break

    if error_selector == 'accuracy':

        if accuracy_KNN > accuracy_svm and accuracy_KNN > accuracy_NB and accuracy_KNN > accuracy_RF and accuracy_KNN >\
                accuracy_ensemble:
            clf = neighbors.KNeighborsClassifier()
            clf.fit(X_train, Y_train)
            print('KNN model chosen')

        elif accuracy_NB > accuracy_KNN and accuracy_NB > accuracy_svm and accuracy_NB > accuracy_RF and accuracy_NB > \
                accuracy_ensemble:
            clf = clf_NB
            clf.fit(X_train, Y_train)
            print('Naive Bayes model chosen')

        elif accuracy_svm > accuracy_NB and accuracy_svm > accuracy_KNN and accuracy_svm > accuracy_RF and accuracy_svm\
                > accuracy_ensemble:
            clf = clf_svm
            clf.fit(X_train, Y_train)
            print('SVM model chosen')
            print('SVM Params: C = ', C, ' gamma = ', gamma, ' kernel = ',
                  kernel)

        elif accuracy_RF > accuracy_NB and accuracy_RF > accuracy_KNN and accuracy_RF > accuracy_ensemble and \
                accuracy_RF > accuracy_svm:
            clf = clf_RF
            clf.fit(X_train, Y_train)
            print('RF model chosen')

        elif accuracy_ensemble > accuracy_svm and accuracy_ensemble > accuracy_NB and accuracy_ensemble > accuracy_RF \
                and accuracy_ensemble > accuracy_KNN:
            clf = ensemble_clf
            clf.fit(X_train, Y_train)
            print('Ensemble model chosen')

    elif error_selector == 'recall':

        if recall_KNN > recall_svm and recall_KNN > recall_NB and recall_KNN > recall_RF and recall_KNN > \
                recall_ensemble:
            clf = neighbors.KNeighborsClassifier()
            clf.fit(X_train, Y_train)
            print('KNN model chosen')

        elif recall_NB > recall_KNN and recall_NB > recall_svm and recall_NB > recall_RF and recall_NB > \
                recall_ensemble:
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            print('Naive Bayes model chosen')

        elif recall_svm > recall_NB and recall_svm > recall_KNN and recall_svm > recall_RF and recall_svm > \
                recall_ensemble:
            clf = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
            clf.fit(X_train, Y_train)
            print('SVM model chosen')
            print('SVM Params: C = ', C, ' gamma = ', gamma, ' kernel = ',
                  kernel)

        elif recall_RF > recall_NB and recall_RF > recall_KNN and recall_RF > recall_ensemble and \
                recall_RF > recall_svm:
            clf = clf_RF
            clf.fit(X_train, Y_train)
            print('RF model chosen')


        elif recall_ensemble > recall_svm and recall_ensemble > recall_NB and recall_ensemble > recall_RF and \
                recall_ensemble > recall_KNN:
            clf = VotingClassifier(estimators=[('NB', clf_NB),
                                               ('SVM', clf_svm),
                                               ('KNN', clf_KNN)],
                                   voting='hard')
            clf.fit(X_train, Y_train)
            print('Ensemble model chosen')

    elif error_selector == 'F1':

        if f1_KNN > f1_svm and f1_KNN > f1_NB and f1_KNN > f1_RF and f1_KNN > f1_ensemble:
            clf = neighbors.KNeighborsClassifier()
            clf.fit(X_train, Y_train)
            print('KNN model chosen')

        elif f1_NB > f1_KNN and f1_NB > f1_svm and f1_NB > f1_RF and f1_NB > f1_ensemble:
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            print('Naive Bayes model chosen')

        elif f1_svm > f1_NB and f1_svm > f1_KNN and f1_svm > f1_RF and f1_svm > f1_ensemble:
            clf = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
            clf.fit(X_train, Y_train)
            print('SVM model chosen')
            print('SVM Params: C = ', C, ' gamma = ', gamma, ' kernel = ',
                  kernel)

        elif f1_RF > f1_NB and f1_RF > f1_KNN and f1_RF > f1_ensemble and f1_RF > f1_svm:
            clf = clf_RF
            clf.fit(X_train, Y_train)
            print('RF model chosen')

        elif f1_ensemble > f1_svm and f1_ensemble > f1_NB and f1_ensemble > f1_RF and f1_ensemble > f1_KNN:
            clf = VotingClassifier(estimators=[('NB', clf_NB),
                                               ('SVM', clf_svm),
                                               ('KNN', clf_KNN)],
                                   voting='hard')
            clf.fit(X_train, Y_train)
            print('Ensemble model chosen')

    elif error_selector == 'precision':

        if precision_KNN > precision_svm and precision_KNN > precision_NB and precision_KNN > precision_RF \
                and precision_KNN > precision_ensemble:
            clf = neighbors.KNeighborsClassifier()
            clf.fit(X_train, Y_train)
            print('KNN model chosen')

        elif precision_NB > precision_KNN and precision_NB > precision_svm and precision_NB > precision_RF \
                and precision_NB > precision_ensemble:
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            print('Naive Bayes model chosen')

        elif precision_RF > precision_NB and precision_RF > precision_KNN and precision_RF > precision_ensemble \
                and precision_RF > precision_svm:
            clf = clf_RF
            clf.fit(X_train, Y_train)
            print('RF model chosen')

        elif precision_svm > precision_NB and precision_svm > precision_KNN and precision_svm > precision_RF \
                and precision_svm > precision_ensemble:
            clf = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
            clf.fit(X_train, Y_train)
            print('SVM model chosen')
            print('SVM Params: C = ', C, ' gamma = ', gamma, ' kernel = ',
                  kernel)

        elif precision_ensemble > precision_svm and precision_ensemble > precision_NB and precision_ensemble > \
                precision_RF and precision_ensemble > precision_KNN:
            clf = VotingClassifier(estimators=[('NB', clf_NB),
                                               ('SVM', clf_svm),
                                               ('KNN', clf_KNN)],
                                   voting='hard')
            clf.fit(X_train, Y_train)
            print('Ensemble model chosen')

    elif error_selector == 'average_all_metric':

        if average_metric_KNN > average_metric_svm and average_metric_KNN > average_metric_NB and average_metric_KNN > \
                average_metric_RF and average_metric_KNN > average_metric_ensemble:
            clf = neighbors.KNeighborsClassifier()
            clf.fit(X_train, Y_train)
            print('KNN model chosen')

        elif average_metric_NB > average_metric_KNN and average_metric_NB > average_metric_svm and average_metric_NB > \
                average_metric_RF and average_metric_NB > average_metric_ensemble:
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            print('Naive Bayes model chosen')

        elif average_metric_RF > average_metric_NB and average_metric_RF > average_metric_KNN and average_metric_RF > \
                average_metric_ensemble and average_metric_RF > average_metric_svm:
            clf = clf_RF
            clf.fit(X_train, Y_train)
            print('RF model chosen')

        elif average_metric_svm > average_metric_NB and average_metric_svm > average_metric_KNN and average_metric_svm \
                > average_metric_RF and average_metric_svm > average_metric_ensemble:
            clf = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
            clf.fit(X_train, Y_train)
            print('SVM model chosen')
            print('SVM Params: C = ', C, ' gamma = ', gamma, ' kernel = ',
                  kernel)

        elif average_metric_ensemble > average_metric_svm and average_metric_ensemble > average_metric_NB and \
                average_metric_ensemble > average_metric_RF and average_metric_ensemble > average_metric_KNN:
            clf = VotingClassifier(estimators=[('NB', clf_NB),
                                               ('SVM', clf_svm),
                                               ('KNN', clf_KNN)],
                                   voting='hard')
            clf.fit(X_train, Y_train)
            print('Ensemble model chosen')

    # Now that model has been selected using error metrics from training data, the final
    # model can be evaluated on the test set. The code below therefore measures the f1, recall,
    # confusion matrix and accuracy  for the final selected model and prints to console.

    Y_test_predicted = clf.predict(X_test)
    final_conf_mx = confusion_matrix(Y_test, Y_test_predicted)

    # calculate normalised confusion matrix
    row_sums = final_conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = final_conf_mx / row_sums
    np.fill_diagonal(norm_conf_mx, 0)

    # plot confusion matrices as subplots in a single figure

    if plot_final_conf_mx == True:

        fig = plt.figure(figsize=(10, 10))
        ax1 = fig.add_subplot(211)
        ax1.imshow(final_conf_mx), plt.title(
            'Final Confusion Matrix'), plt.colorbar
        classes = clf.classes_
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes, rotation=45)

        ax2 = fig.add_subplot(212)
        ax2.imshow(norm_conf_mx, cmap=plt.cm.gray), plt.title(
            'Normalised Confusion Matrix'), plt.colorbar,
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes, rotation=45)

        plt.tight_layout()

        if savefigs:
            plt.savefig(
                str(savefig_path + "final_model_confusion_matrices.jpg"))

        plt.show()

    # calculate performance measures for final model
    final_recall = recall_score(Y_test, Y_test_predicted, average="weighted")
    final_f1 = f1_score(Y_test, Y_test_predicted, average="weighted")
    final_accuracy = clf.score(X_test, Y_test)
    final_precision = precision_score(Y_test,
                                      Y_test_predicted,
                                      average='weighted')
    final_average_metric = (final_recall + final_accuracy + final_f1) / 3

    if print_conf_mx:
        print('Final Confusion Matrix')
        print(final_conf_mx)
        print()
        print('Normalised Confusion Matrix')
        print(norm_conf_mx)

    # The Feature importances
    print()
    print('Feature Importances')
    print('(relative importance of each feature (wavelength) for prediction)')
    print()
    # note: feature_importances_ is only defined for tree-based final models
    # (here, the random forest); other final models will raise AttributeError.
    for name, score in zip(X.columns, clf.feature_importances_):
        print(name, score)

    print()  # line break
    print('*** FINAL MODEL SUMMARY ***')
    print('Final Model Accuracy = ', final_accuracy)
    print('Final Model Recall = ', final_recall)
    print('Final Model F1 = ', final_f1)
    print('Final Model Precision = ', final_precision)
    print('Final Model Average metric = ', final_average_metric)

    if pickle_model:
        # pickle the classifier model for archiving or for reusing in another code
        joblibfile = 'UAV_classifier.pkl'
        joblib.dump(clf, joblibfile)

        # to load this classifier into another code use the following syntax:
        # clf = joblib.load(joblib_file)

    return clf, final_conf_mx, norm_conf_mx
#predict using only adaboost

sev = []
for i in range(2500):
    lt = []
    for k in range(10):
        lt.append(X_final[i][k])
    ans = ada_best.predict(sc.transform(np.array([lt])))

    sev.append((ids[i], ans[0]))

answers3 = []

for i in range(2500):
    answers3.append(sev[i][1])

#predict using ensemble

sev = []
for i in range(2500):
    lt = []
    for k in range(10):
        lt.append(X_final[i][k])
    ans = votingC.predict(sc.transform(np.array([lt])))

    sev.append((ids[i], ans[0]))

answers4 = []

for i in range(2500):
    answers4.append(sev[i][1])
def the_voting(N, X_ALL, X_ALL_val, y_ALL, y_ALL_val, pipe_list, DS, CLF):
    y_ALL = y_ALL.ravel()
    y_ALL_val = y_ALL_val.ravel()

    if N == 3:
        eclf = VotingClassifier(estimators=[
            (CLF[0] + '+' + DS[0], pipe_list[0]),
            (CLF[1] + '+' + DS[1], pipe_list[1]),
            (CLF[2] + '+' + DS[2], pipe_list[2])
        ],
                                voting='hard',
                                n_jobs=-1)
        print(' Sanity check [Ensemble]')
        print(' -> ALL Together')
        eclf.fit(X_ALL, y_ALL)
        print(eclf.score(X_ALL_val, y_ALL_val))

        y_true, y_pred = y_ALL_val, eclf.predict(X_ALL_val)
        CM = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print(report)
        print(CM)
        print(' --> Validation Accuracy:')
        e_val_score = accuracy_score(y_true, y_pred)
        print('    ' + "{:.2%}".format(accuracy_score(y_true, y_pred)))

        acc = []
        labels = []
        ## THIS IS FROM sklearn. (sidenote)
        for clf__, label in zip(
            [pipe_list[0], pipe_list[1], pipe_list[2], eclf], [
                DS[0] + '+' + CLF[0], DS[1] + '+' + CLF[1],
                DS[2] + '+' + CLF[2], 'Ensemble'
            ]):
            scores = cross_val_score(clf__,
                                     X_ALL,
                                     y_ALL,
                                     scoring='accuracy',
                                     cv=5)
            print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
                  (scores.mean(), scores.std(), label))
            acc.append(scores.mean())
            labels.append(label)

        return eclf, eclf.score(
            X_ALL_val, y_ALL_val), acc, labels, e_val_score, CM, report

    else:  ## N==2:
        # ensemble/voting classifier where clf1 fitted with df1 and clf2 fitted with df2
        eclf = VotingClassifier(estimators=[
            (CLF[0] + '+' + DS[0], pipe_list[0]),
            (CLF[1] + '+' + DS[1], pipe_list[1])
        ],
                                voting='hard',
                                n_jobs=-1)
        print(' Sanity check [Ensemble]')
        print(' -> ALL Together')
        eclf.fit(X_ALL, y_ALL)
        print(eclf.score(X_ALL_val, y_ALL_val))

        y_true, y_pred = y_ALL_val, eclf.predict(X_ALL_val)
        CM = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print(report)
        print(CM)
        print(' --> Validation Accuracy:')
        e_val_score = accuracy_score(y_true, y_pred)
        print('    ' + "{:.2%}".format(accuracy_score(y_true, y_pred)))

        acc = []
        labels = []
        ## adapted from the sklearn VotingClassifier cross-validation example
        for clf__, label in zip(
            [pipe_list[0], pipe_list[1], eclf],
            [DS[0] + '+' + CLF[0], DS[1] + '+' + CLF[1], 'Ensemble']):
            scores = cross_val_score(clf__,
                                     X_ALL,
                                     y_ALL,
                                     scoring='accuracy',
                                     cv=5)
            print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
                  (scores.mean(), scores.std(), label))
            acc.append(scores.mean())
            labels.append(label)

        return eclf, eclf.score(
            X_ALL_val, y_ALL_val), acc, labels, e_val_score, CM, report
Example #32
Logistic regression hyperparameters: C, penalty, multi_class
SVC hyperparameters: C, kernel, gamma
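A minimal grid-search sketch over these hyperparameters (a sketch only: X_train/y_train are assumed to exist, and the grids are illustrative, not tuned):

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Grids for the hyperparameters listed above (illustrative values).
log_grid = GridSearchCV(LogisticRegression(max_iter=1000),
                        param_grid={'C': [0.01, 0.1, 1, 10], 'penalty': ['l2']},
                        cv=5)
svc_grid = GridSearchCV(SVC(),
                        param_grid={'C': [0.1, 1, 10],
                                    'kernel': ['linear', 'rbf'],
                                    'gamma': ['scale', 0.01, 0.1]},
                        cv=5)
log_grid.fit(X_train, y_train)
svc_grid.fit(X_train, y_train)
print(log_grid.best_params_, svc_grid.best_params_)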


6. VotingClassifier uses an ensemble technique to combine multiple predictors (same data, multiple estimators).

from sklearn.ensemble import VotingClassifier
lr = LogisticRegression(random_state=SEED)
knn = KNN(n_neighbors=27)
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)
# Define the list of (name, estimator) tuples for the individual estimators
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]
vc = VotingClassifier(estimators=classifiers)
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)


7. BaggingClassifier (same estimator, multiple datasets created via bootstrap aggregation)

dt = DecisionTreeClassifier(random_state=1)
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test)) 


8. You can estimate the performance of the ensemble model using out-of-bag (OOB) instances: with
bootstrapping, on average 63% of the training samples are drawn for any one estimator, and the
remaining 37% constitute its OOB instances. Each model is trained on its bootstrapped sample and
evaluated on its OOB instances, and the OOB scores are then averaged, as sketched below.
                        voting='hard').set_params(n_jobs=10).fit(train_data[train_data.columns[2:]], train_data['cancer_type_id'])

clfs = [clf1, clf2, clf3, clf4, clf5]

# NOTE: assigning through locals() only works reliably at module scope;
# a dict keyed by the classifier index would be the cleaner idiom.
for i in [1, 2, 3, 4, 5, 6]:
    locals()["predict{}_valid_proba".format(i)] = locals()["clf{}".format(i)].predict_proba(valid_data[train_data.columns[2:]])
    locals()["classifier{}_valid".format(i)] = pd.DataFrame(locals()["predict{}_valid_proba".format(i)], index = valid_data.index)
    locals()["classifier{}_valid".format(i)].insert(0 , "true_type", valid_data['cancer_type_id'])
    
    locals()["predict{}_test".format(i)] = locals()["clf{}".format(i)].predict(test_data[train_data.columns[2:]])
    locals()["accuracy{}_test".format(i)] = (locals()["predict{}_test".format(i)] == test_data['cancer_type_id']).mean()
    locals()["predict_proba{}".format(i)] = locals()["clf{}".format(i)].predict_proba(test_data[train_data.columns[2:]])
    locals()["classifier{}_test".format(i)] = pd.DataFrame(locals()["predict_proba{}".format(i)], index = test_data.index)
    locals()["classifier{}_test".format(i)].insert(0 , "true_type", test_data['cancer_type_id'])
    
predict7_test = clf7.predict(test_data[train_data.columns[2:]])
accuracy7_test = (predict7_test == test_data['cancer_type_id']).mean()


## Weights for performance-weighted voting
from sklearn.preprocessing import LabelBinarizer 

encoder = LabelBinarizer(sparse_output = False)

valid_one_hot = encoder.fit_transform(valid_data['cancer_type_id'])
valid_one_hot = pd.DataFrame(valid_one_hot, index = valid_data.index)

test_one_hot = encoder.transform(test_data['cancer_type_id'])  # reuse the fitted class-to-column mapping
test_one_hot = pd.DataFrame(test_one_hot, index = test_data.index)
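One plausible way to finish the performance-weighted vote (a sketch under the assumption that clf1..clf5 above are fitted and share the same classes_ ordering; the names weights/weighted_proba are illustrative):

import numpy as np

feature_cols = train_data.columns[2:]
clf_list = [clf1, clf2, clf3, clf4, clf5]

# Weight each classifier by its validation accuracy ...
weights = np.array([
    (c.predict(valid_data[feature_cols]) == valid_data['cancer_type_id']).mean()
    for c in clf_list
])
weights = weights / weights.sum()

# ... and combine the test-set probabilities as a weighted average (soft vote).
weighted_proba = sum(w * c.predict_proba(test_data[feature_cols])
                     for w, c in zip(weights, clf_list))
weighted_pred = clf_list[0].classes_[weighted_proba.argmax(axis=1)]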

def main(logger=None):
    ''' Main routine to call the entire process flow '''

    # Load_Dataset --- Process starts

    logger.info(f'')
    logger.info(f'{"-"*20} Load dataset starts here {"-"*20}')
    logger.info(f'')

    # TODO: DONE; Load Cancer dataset;

    cancer_data_dict = datasets.load_breast_cancer()
    cancer_data_pd = convert2pandas_df(
        x_array=cancer_data_dict['data'],
        y=[
            cancer_data_dict['target_names'][i]
            for i in cancer_data_dict['target']
        ],
        # feature_names=iris_dict['feature_names'],
        feature_names=list(cancer_data_dict['feature_names']),
        target_name='Target')

    # logger.info(f'{cancer_data_pd.head()}');

    sns.lmplot(x="area error",
               y="compactness error",
               data=cancer_data_pd,
               fit_reg=False,
               hue='Target',
               legend=False,
               palette=dict(malignant="#BF0C2B", benign="#02173E"))
    # , versicolor="#F5900E"));
    plt.legend(loc='lower right')
    chart_save_image(plt=plt,
                     f_size=(8, 8),
                     left=0.125,
                     right=0.9,
                     bottom=0.125,
                     top=0.9,
                     wspace=0.0,
                     hspace=0.0,
                     fileName='./Cancer_Data_Plot.png')

    selected_columns = [
        'Target', 'mean radius', 'mean texture', 'mean perimeter', 'mean area',
        'mean smoothness', 'mean compactness', 'mean concavity',
        'mean concave points', 'mean symmetry'
    ]

    g = sns.pairplot(cancer_data_pd[selected_columns],
                     hue="Target",
                     diag_kind="kde",
                     palette=dict(malignant="#BF0C2B", benign="#02173E"),
                     diag_kws=dict(shade=True))
    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        g.axes[i, j].set_visible(False)
    chart_save_image(plt=plt,
                     f_size=(16, 16),
                     left=0.05,
                     right=0.97,
                     bottom=0.05,
                     top=0.97,
                     wspace=0.02,
                     hspace=0.02,
                     fileName='./Cancer_Data_PairPlot.png')

    logger.info(f'')
    logger.info(f'{"-"*20}  Load dataset ends here {"-"*20}')
    logger.info(f'')

    # Load_Dataset --- Process ends

    # __Placeholder__ --- Process Starts

    # TODO: DONE; 001; Train test split; stratified;
    X_train, X_test, y_train, y_test = train_test_split(
        cancer_data_pd[cancer_data_dict.feature_names],
        # cancer_data_pd['Target'],
        cancer_data_dict['target'],  # has to be binary for the F1 and precision scorers
        test_size=0.20,
        # stratify=cancer_data_pd['Target'],
        stratify=cancer_data_dict['target'],
        random_state=111,
        shuffle=True)

    logger.info(f'X_train.shape : {X_train.shape}')
    logger.info(f'X_test.shape  : {X_test.shape}')
    logger.info(f'Y_train.shape : {y_train.shape}')
    logger.info(f'Y_test.shape  : {y_test.shape}')

    # TODO: DONE; 002; Dummy Classifier ;

    # dummy_classifier = DummyClassifier(strategy="stratified");
    dummy_classifier = DummyClassifier(strategy="most_frequent")

    # TODO: DONE; 003; Cross_over_score and predict and Metrics (make_scorer)

    accuracy_scorer = make_scorer(cost_accuracy, greater_is_better=True)

    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=111)  # shuffle is required when random_state is set
    # results = model_selection.cross_val_score(dummy_classifier, X_train, y_train, cv=kfold, scoring='accuracy');
    # logger.info(f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}');

    results = model_selection.cross_val_score(dummy_classifier,
                                              X_train,
                                              y_train,
                                              cv=kfold,
                                              scoring=accuracy_scorer)
    logger.info(
        f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}')

    DummyClassifier_mean = np.mean(results)

    # TODO: DONE; 004; Standardization ;

    # std_scaler = preprocessing.StandardScaler();  # Contains the negative values
    std_scaler = preprocessing.MinMaxScaler()
    # Range between 0 to 1; No negative terms;
    std_scaler = std_scaler.fit(X_train)
    scaled_X_train = pd.DataFrame(std_scaler.transform(X_train),
                                  columns=X_train.columns)

    logger.info(f'{X_train["mean radius"].describe()}')
    logger.info(f'{scaled_X_train["mean radius"].describe()}')

    # TODO: DONE; 005; SelectKBest; Feature selection ;

    # selectKbest_est = SelectKBest(chi2, k=4); f_classif
    selectKbest_est = SelectKBest(f_classif, k=8)
    selectKbest_X_train = selectKbest_est.fit_transform(X_train, y_train)

    logger.info(f'{selectKbest_est.get_params(deep=True)}')
    logger.info(f'{selectKbest_est.get_support(indices=False)}')
    logger.info(f'{selectKbest_est.get_support(indices=True)}')
    logger.info(
        f'{X_train.columns[selectKbest_est.get_support(indices=True)]}')

    # TODO: DONE; 006; Polynomial Features ;

    poly = preprocessing.PolynomialFeatures(degree=2,
                                            include_bias=False,
                                            interaction_only=False)
    X_train_poly = poly.fit_transform(X_train)
    X_train_p2 = pd.DataFrame(X_train_poly,
                              columns=poly.get_feature_names(X_train.columns))

    lr = linear_model.LogisticRegression(fit_intercept=False, random_state=111)
    results = model_selection.cross_val_score(lr,
                                              X_train_p2,
                                              y_train,
                                              cv=kfold,
                                              scoring=accuracy_scorer)
    # , verbose=True);

    imp_percentage = round(
        (np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4)

    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}')

    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')

    # TODO: DONE; 007; Kernel PCA ;

    # kernel_param = ('rbf', 0.25);
    kernel_param = ('rbf', 1)

    kpca = KernelPCA(n_components=4,
                     kernel=kernel_param[0],
                     gamma=kernel_param[1],
                     fit_inverse_transform=True,
                     random_state=111)  # n_jobs=-1,
    kpca.fit(scaled_X_train)
    # The data has to be scaled;
    kpca_X_train = kpca.transform(scaled_X_train)

    lr = linear_model.LogisticRegression(fit_intercept=False, random_state=111)
    results = model_selection.cross_val_score(lr,
                                              kpca_X_train,
                                              y_train,
                                              cv=kfold,
                                              scoring=accuracy_scorer)
    # , verbose=True);

    imp_percentage = round(
        (np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4)

    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}')

    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')

    # TODO: DONE; 008; Grid-Search ;

    # tuned_parameters = [{
    #                      'n_estimators' : [1, 10, 100, 500, 1000, 2000],
    #                      'max_depth' : [10, 20],
    #                      'max_features' : [0.80, 0.40],
    #                      'random_state' : [111]
    #                      }];

    tuned_parameters = [{
        'n_estimators': [1, 10],
        'max_depth': [10, 20],
        'max_features': [0.80, 0.40],
        'random_state': [111]
    }]

    clf = GridSearchCV(RandomForestClassifier(),
                       tuned_parameters,
                       cv=5,
                       scoring=accuracy_scorer)
    clf.fit(X_train, y_train)

    logger.info(
        f'Best parameters set found on development set: {clf.best_score_} {clf.best_params_}'
    )
    logger.info('')
    logger.info('Grid scores on development set:')
    logger.info('')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        logger.info(f'{round(mean,3)} (+/-{round(std*2,2)}) for {params}')
    logger.info('')

    logger.info('Detailed classification report:')
    logger.info('')
    logger.info('The model is trained on the full development set.')
    logger.info('The scores are computed on the full evaluation set.')
    logger.info('')
    y_true, y_pred = y_test, clf.predict(X_test)
    logger.info(f'{metrics.classification_report(y_true, y_pred)}')
    logger.info('')

    imp_percentage = round(
        (clf.best_score_ - DummyClassifier_mean) / DummyClassifier_mean, 4)
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    logger.info(
        f'GridSearchCV RandomForestClassifier accuracy : {clf.best_score_}')
    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')

    # logger.info(f'{clf.best_estimator_}');

    # TODO: DONE; 009; Customer Transformer for the pipeline ;
    # reference : https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/
    # http://philipmgoddard.com/modeling/sklearn_pipelines

    ctf = ColumnTypeFilter(np.number)
    ctf.fit_transform(X_train).head()

    # TODO: YTS; 010; Pipeline ;

    custom_pipeline = make_pipeline(
        FeatureUnion(
            transformer_list=[('StdScl',
                               make_pipeline(ColumnTypeFilter(np.number),
                                             preprocessing.StandardScaler())),
                              ('MMScl',
                               make_pipeline(ColumnTypeFilter(np.number),
                                             preprocessing.MinMaxScaler()))]))

    custom_pipeline.fit(X_train)
    X_test_transformed = custom_pipeline.transform(X_test)

    logger.info(
        f'{X_test.shape} {type(X_test_transformed)} {X_test_transformed.shape}'
    )

    # TODO: DONE; 011; Ensemble (VotingClassifier) and BaseClone;

    ensemble_clf = VotingClassifier(
        estimators=[
            ('dummy', dummy_classifier),
            ('logistic', lr),
            # ('supportvector', SVC(probability=True)),
            ('randomforest', RandomForestClassifier())
        ],
        voting='soft')

    ensemble_clf.fit(X_train, y_train)
    ensemble_clf_accuracy_ = cost_accuracy(y_test,
                                           ensemble_clf.predict(X_test))

    imp_percentage = round(
        (ensemble_clf_accuracy_ - DummyClassifier_mean) / DummyClassifier_mean,
        4)
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    logger.info(
        f'VotingClassifier ensemble accuracy : {ensemble_clf_accuracy_}'
    )
    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')

    # TODO: DONE; 012; One-hot encoder; Label Encoder; Binary Encoder;

    baby_names = ['Ava', 'Lily', 'Noah', 'Jacob', 'Mia', 'Sophia']
    X_train_list = [np.random.choice(baby_names) for i in range(40)]
    X_test_list = [np.random.choice(baby_names) for i in range(6)]

    bb_labelencoder = preprocessing.LabelEncoder()
    bb_labelencoder.fit(X_train_list)
    bb_encoded = bb_labelencoder.transform(X_test_list)

    bb_onehotencoder = preprocessing.OneHotEncoder(sparse=False)
    bb_encoded = bb_encoded.reshape(len(bb_encoded), 1)
    bb_onehot = bb_onehotencoder.fit_transform(bb_encoded)

    for i, v in enumerate(X_test_list):
        logger.info(
            f'Actual : {v} \t | LabelEncoded : {bb_encoded[i][0]} \t | OneHot : {bb_onehot[i]}'
        )

    # TODO: DONE; 013; Feature Extraction from image and text;

    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)

    cntvector_out = pd.DataFrame(X.toarray(),
                                 columns=vectorizer.get_feature_names())

    for i, v in enumerate(corpus):
        logger.info(f'Input text : {v}')
        logger.info('Output count vector :')
        logger.info(f'{cntvector_out.iloc[i]}')
Example #35
# -------- Predicting with Random Forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_jobs=-1, random_state=0)
forest.fit(X_train, y_train)
y_train_forest = forest.predict(X_train)
y_pred_forest = forest.predict(X_test)
print('Random Forest Train Score:', np.mean(y_train == y_train_forest))
print('Random Forest Test Score:', np.mean(y_test == y_pred_forest))

# -------- Predicting with Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=0.1)  # C is the inverse regularization strength: smaller C, stronger regularization
lr.fit(X_train, y_train)
y_train_lr = lr.predict(X_train)
y_pred_lr = lr.predict(X_test)
print('Logistic Regression Train Score:', np.mean(y_train == y_train_lr))
print('Logistic Regression Test Score:', np.mean(y_test == y_pred_lr))

# -------- Predicting with Ensemble Voting based on the above classifiers
from sklearn.ensemble import VotingClassifier

eclf = VotingClassifier(estimators=[('xgboost', xgb), ('gbrt', gbrt), ('forest', forest),
                                    ('logistic regression', lr)],
                        voting='soft',
                        weights=None)  # e.g. [2, 5, 2, 1]; None uses uniform weights
eclf = eclf.fit(X_train, y_train)
y_train_ensemble = eclf.predict(X_train)
y_pred_ensemble = eclf.predict(X_test)
print('Ensemble Voting Train Score:', np.mean(y_train == y_train_ensemble))
print('Ensemble Voting Test Score:', np.mean(y_test == y_pred_ensemble))
Example #36
"""
Better performance with a Voting Classifier
Finally, you'll evaluate the performance of a voting classifier that takes the outputs of the models defined in the list classifiers and assigns labels by majority voting.

X_train, X_test, y_train, y_test, the list classifiers defined in a previous exercise, as well as the function accuracy_score from sklearn.metrics are available in your workspace.

INSTRUCTION
-----------
Import VotingClassifier from sklearn.ensemble.
Instantiate a VotingClassifier by setting the parameter estimators to classifiers and assign it to vc.
Fit vc to the training set.
Evaluate vc's test set accuracy using the test set predictions y_pred.
"""
# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers)

# Fit vc to the training set
vc.fit(X_train, y_train)

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))
Example #37
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import VotingClassifier as VC
mnb = MNB(alpha=10)
lr = LR(random_state=101)
rfc = RFC(n_estimators=80, criterion="entropy", random_state=42, n_jobs=-1)
clf = VC(estimators=[('mnb', mnb), ('lr', lr), ('rfc', rfc)], voting='hard')

clf.fit(X_train,y_train)

predict = clf.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, predict))
print('\n')
print(classification_report(y_test, predict))


def predictor(s):
    s = vectorizer.transform(s)
    pre = clf.predict(s)
    print(pre)

predictor(['I\'m on the Mexican, whoa oh oh, radio.'])

Example #38
# print('Fact x: \n', qtable_X, '\n')
# print('Fact y: \n', qtable_decisioned, '\n')

clf1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)  # multinomial needs an lbfgs-style solver
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

model1 = clf1.fit(qtable_X, qtable_decisioned_X)
model2 = clf2.fit(qtable_X, qtable_decisioned_X)
model3 = clf3.fit(qtable_X, qtable_decisioned_X)

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                     ('gnb', clf3)],
                         voting='hard')
eclf1 = eclf1.fit(qtable_X, qtable_decisioned_Y)
y_pred1 = eclf1.predict(qtable_Y)
print(eclf1.predict(qtable_X))

np.array_equal(eclf1.named_estimators_.lr.predict(qtable_X),
               eclf1.named_estimators_['lr'].predict(qtable_X))

eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                     ('gnb', clf3)],
                         voting='soft')
eclf2 = eclf2.fit(qtable_X, qtable_decisioned_Y)
y_pred2 = eclf2.predict(qtable_Y)
print(eclf2.predict(qtable_X))

eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                     ('gnb', clf3)],
                         voting='soft')
clf.predict(test_[cols])
preds = clf.predict_proba(test_[cols])
#print(confusion_matrix(test['class'], clf.predict(test[cols])))
print(pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"]))
print(classification_report(test_['TripType'], clf.predict(test_[cols])))
score = accuracy_score(test_['TripType'], clf.predict(test_[cols]))
table.append([score])
print(table)

eclf = VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))),
    ('RandomForest', RandomForestClassifier(10)),
    ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))],
    voting='soft', weights=[7,1,1])
eclf.fit(train[cols], train["TripType"])
#use the classifier to predict
predicted=eclf.predict(test[cols])
#print (accuracy_score(predicted,test['TripType']))
#print(classification_report(predicted,test['TripType']))

'''
OvR = OneVsRestClassifier(BaggingClassifier((LogisticRegression()))).fit(train_[cols], train_["TripType"])
predicted = OvR.predict(test_[cols])
print accuracy_score(predicted,test_['TripType'])

rf = RandomForestClassifier()
rf.fit(train_[cols], train_["TripType"])
predicted = rf.predict(test_[cols])
print accuracy_score(predicted,test_['TripType'])

ada = AdaBoostClassifier()
ada.fit(train_[cols], train_["TripType"])
Example #40
m3 = GradientBoostingClassifier(loss='deviance', learning_rate=0.3,
                                n_estimators=200, max_depth=7,
                                min_samples_leaf=10, random_state=7,
                                max_features=None, verbose=1)

model = VotingClassifier(weights=[4, 5, 1], voting='soft',
                         estimators=[('SVM', m1), ('Rnd Forest', m2),
                                     ('Grad Boost', m3)])

model = runModel(model=model, trainX=X_train[0:30000], trainY=y_train[0:30000],
                 optimize=False, parameters=None, scoring='roc_auc')


print "Applying Model ..."
start = time()
y_pred = model.predict(X_test)
print("Model took %.2f seconds to predict vals" % (time() - start))


### Evaluation
print "Scoring Classifier..."
start = time()

score = model.score(X_test, y_test)
recall = metrics.recall_score(y_test, y_pred, average='binary')
auc = metrics.roc_auc_score(y_test, y_pred, average='macro')
confusion = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])

print "Score: \t \t Recall: \t AUC:\n", score, recall, auc
print("Model took %.2f seconds to score" % (time() - start))
Example #41
def AllModels (file, in_columns, out_columns):		
	data = numpy.genfromtxt(file, delimiter=",", autostrip=True)
	data = data[2:]
#	numpy.asarray(numpy.random.shuffle(data[:2400]))
	array = data
	X = array[50:-50, in_columns]
#	print X
	X = Imputer(missing_values='NaN', strategy='mean', axis=0).fit_transform(X)
	Y = array[50:-50, out_columns]
	#print X
	Y = Imputer(missing_values='NaN', strategy='mean', axis=0).fit_transform(Y)
#	print Y
	
	validation_size = 0.2
	#scoring = 'accuracy'

#	X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state = 0)
	X_train, X_validation, Y_train, Y_validation = X[0:2400], X[2400:], Y[0:2400], Y[2400:]
#	print X_train.pvalues_()
	lr = LogisticRegression()
	lr.fit(X_train, Y_train)
	predictions = lr.predict(X_validation)
	print('LR : ' + str(accuracy_score(Y_validation, predictions)))

	lda = LinearDiscriminantAnalysis()
	lda.fit(X_train, Y_train)
	predictions = lda.predict(X_validation)
	print('LDA: ' + str(accuracy_score(Y_validation, predictions)))

	knn = KNeighborsClassifier()
	knn.fit(X_train, Y_train)
	predictions = knn.predict(X_validation)
	print('KNN: ' + str(accuracy_score(Y_validation, predictions)))

	rf = DecisionTreeClassifier()
	rf.fit(X_train, Y_train)
	predictions = rf.predict(X_validation)
	print('DT : ' + str(accuracy_score(Y_validation, predictions)))

	nb = GaussianNB()
	nb.fit(X_train, Y_train)
	predictions = nb.predict(X_validation)
	print('NB : ' + str(accuracy_score(Y_validation, predictions)))

	svm = SVC()
	svm.fit(X_train, Y_train)
	predictions = svm.predict(X_validation)
	print('SVM: ' + str(accuracy_score(Y_validation, predictions)))

	print('--------------------')
	rf=RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=None,
		 					min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
							 max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, 
		 					min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, 
		 					random_state=None, verbose=0, warm_start=False, class_weight=None)
	rf.fit(X_train, Y_train)
	print('rf: ' + str(rf.score(X_validation, Y_validation)))
	et=ExtraTreesClassifier(n_estimators=300, criterion='gini', max_depth=None, min_samples_split=2, 
						min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', 
						max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, 
						bootstrap=False, oob_score=False, n_jobs=1, random_state=None, verbose=0, 
						warm_start=False, class_weight=None)
	et.fit(X_train, Y_train)
	print('et: ' + str(et.score(X_validation, Y_validation)))
	#cnf_matrix = confusion_matrix(Y_validation, y_pred)
	#print cnf_matrix


	rf = []

	for i in range(1,5):
		rf.append(ExtraTreesClassifier(n_estimators=300, criterion='gini', max_depth=None,
		 					min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
							 max_features=i*6, max_leaf_nodes=None, min_impurity_decrease=0.0, 
		 					min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, 
		 					random_state=None, verbose=0, warm_start=False, class_weight=None))
		#cnf_matrix = confusion_matrix(Y_validation, y_pred)
		#print cnf_matrix
	l = []
	for i in range(len(rf)):
		l.append((str(i),rf[i]))
	lda = LinearDiscriminantAnalysis()
#	l.append(('a',lda))
#	l.append(('b',lda))
	l.append(('c',lda))
	l.append(('d',lda))
	ecl = VotingClassifier(estimators = l, voting = 'hard')
#	ecl = AdaBoostClassifier(base_estimator = rf[0])
	ecl.fit(X_train, Y_train)
	y_pred = ecl.predict(X_validation)
	ret = accuracy_score(Y_validation, y_pred)
	print(ret)
	cnf_matrix = confusion_matrix(Y_validation, y_pred,labels=[-3,-2,-1,0,1,2,3])
	#print cnf_matrix
	s1 = 0.0
	for i in cnf_matrix:
		s1 = s1 + sum(i)
	print('---------------')
	s = 0.0
	for i in cnf_matrix[0:3,0:3]:
		s = s+sum(i)
	for i in cnf_matrix[4:7,4:7]:
		s = s+sum(i)
	print(s / s1)
	return ret
#
#clf = RandomForestClassifier()
clf1 = RandomForestClassifier(max_depth=5, n_estimators=20, max_features=1)
#clf2 = SVC(kernel="linear", C=0.025)	#Kernel: linear, poly, rbf
#clf2 = GradientBoostingClassifier(n_estimators=100)
#clf3 = linear_model.LogisticRegression(C=1e5)
clf2 = RandomForestClassifier(random_state=1)
clf3 = RandomForestClassifier(random_state=2)

print('Testing... ')

eclf1 = VotingClassifier(estimators=[('rf', clf1), ('svm', clf2), ('gb', clf3)], voting='hard')
eclf1 = eclf1.fit(X_data, Y_data)

predictions = eclf1.predict(x_data)

#print y_data
#print predictions	
#clf.fit(X_train, y_train)
print(classification_report(y_data, predictions))

#predictions = clf.predict(X_test)

#print y_test
#print predictions
#scores = cross_validation.cross_val_score(clf, x_data, targets_data, cv=5)
#print ""
#print "CV Scores....: ", scores
#print "CV Mean score: ", sum(scores)/(len(scores)*1.0)
# ### 6.2 Ensemble modeling
# #### 6.2.1 Combining models
#
# I chose a voting classifier to combine the predictions coming from the 5 classifiers.
#
# I passed "soft" to the voting parameter so that each vote is weighted by the predicted class probabilities.

# In[75]:

votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
                                       ('svc', SVMC_best), ('adac', ada_best),
                                       ('gbc', GBC_best)],
                           voting='soft',
                           n_jobs=4)

votingC = votingC.fit(X_train, Y_train)

# ### 6.3 Prediction
# #### 6.3.1 Predict and Submit results

# In[76]:

test_Survived = pd.Series(votingC.predict(test), name="Survived")

results = pd.concat([IDtest, test_Survived], axis=1)

results.to_csv("ensemble_python_voting.csv", index=False)

# If you found this notebook helpful or you just liked it, some upvotes would be very much appreciated - that will keep me motivated :)
# In[ ]:

votingC = VotingClassifier(estimators=[('svc', SVMC_best), ('rfc', RFC_best),
                                       ('lrc', LRC_best)],
                           voting='soft',
                           n_jobs=4)

votingC = votingC.fit(X_train, Y_train)

# ### 6. Prediction and Submission

# In[ ]:
'''
test_Survived = pd.Series(votingC.predict(test), name="Survived")

results = pd.concat([IDtest,test_Survived],axis=1)

results.to_csv("Titanic_test_set_prediction.csv",index=False)

'''

# In[ ]:

test_Survived = votingC.predict(test).astype(int)
submission = pd.DataFrame({"PassengerId": IDtest, "Survived": test_Survived})
submission.to_csv('Titanic_test_prediction_V9.csv', index=False)

# In[ ]:

accuracy_score(Y_train, votingC.predict(X_train))
            row = C1.query(query)

        C = float(row['select_C'])
        clf.set_params(clf__C=C)
        w = float(row['select_mean'])
        weights.append(w)

    # set weight to mean of CV scores for selected C
    vot_clf.set_params(weights=weights)

    vot_clf.fit(target_train_data.Tweet, true_stances)

    # predict on test data
    index = test_data.Target == target
    test_tweets = test_data.loc[index, 'Tweet']
    test_data.loc[index, 'Stance'] = vot_clf.predict(test_tweets)

    # predict on training data too to gauge overfitting
    index = train_data.Target == target
    train_tweets = train_data.loc[index, 'Tweet']
    pred_stances = vot_clf.predict(train_tweets)

    print(classification_report(true_stances, pred_stances,
                                digits=4))

    macro_f = fbeta_score(true_stances, pred_stances, 1.0,
                          labels=['AGAINST', 'FAVOR'], average='macro')

    print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(
            macro_f))
Example #46
    # LR
    # lr = LogisticRegression()
    # lr.fit(tfidf_train, target_train)
    # lr_pred = lr.predict(tfidf_test)
    # Random forest
    # rf = RandomForestClassifier(random_state=1)
    # rf.fit(tfidf_train, target_train)
    # rf_pred=rf.predict(tfidf_test)

    # Ensemble classifiers
    # Voting
    vote = VotingClassifier(estimators=[('nb', bayes), ('dt', tre),
                                        ('svm', svc)],
                            voting='hard')
    vote.fit(tfidf_train, target_train)
    vote_pred = vote.predict(tfidf_test)

    # Adaboost
    # ab = AdaBoostClassifier()
    # ab.fit(tfidf_train, target_train)
    # ab_pred = ab.predict(tfidf_test)
    # Bagging
    # bag = BaggingClassifier()
    # bag.fit(tfidf_train, target_train)
    # bag_pred = bag.predict(tfidf_test)
    # Gradient boosting
    # MemoryError; no longer used
    # gb = GradientBoostingClassifier()
    # gb.fit(tfidf_train, target_train)
    # gb_pred = gb.predict(tfidf_test.toarray())
    # Unsupervised
# Random forest
clf2 = RandomForestClassifier(n_estimators=50,
                              max_depth=1,
                              min_samples_split=4,
                              min_samples_leaf=54,
                              oob_score=True)
clf2.fit(X_train, y_train)
# Print prediction accuracy on the test set
# print(clf2.score(X_test, y_test))
# print(confusion_matrix(y_test, clf2.predict(X_test)))

print("*" * 100)

# Decision tree
tre = DecisionTreeClassifier(criterion='gini', splitter='best')
tre.fit(X_train, y_train)
# Print prediction accuracy on the test set
# print(tre.score(X_test, y_test))
# print(confusion_matrix(y_test, tre.predict(X_test)))

eclf = VotingClassifier(estimators=[('svcnl', tre), ('rf', clf2),
                                    ('svc', cls1)],
                        voting='hard')
eclf.fit(X_train, y_train)
# Print prediction accuracy on the test set
print("Linear SVM ", cls1.score(X_test, y_test))
print("Nonlinear SVM ", tre.score(X_test, y_test))
print("Random forest ", clf2.score(X_test, y_test))
print("Ensemble ", eclf.score(X_test, y_test))
print(confusion_matrix(y_test, eclf.predict(X_test)))
Example #48
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X, y)

    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == 'drop'
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == 'drop'

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(voting='soft').fit(X, y)

    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr='drop', rf='drop', nb='drop').fit(X, y)
    assert not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X1, y1)
    assert not record
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #49
def analisar_features(train_text,
                      n_gram=1,
                      pos=False,
                      tags=False,
                      dep=False,
                      stem=False,
                      remove_stop_words=False,
                      remove_punct=False,
                      ent=False,
                      alpha=False,
                      lex=False,
                      file_path='log.txt'):

    print('Features used: \n')
    print('NGRAM: ' + str(n_gram) + '\n')
    print('tags: ' + str(tags) + '\n')
    print('pos: ' + str(pos) + '\n')
    print('dep: ' + str(dep) + '\n')
    print('stem: ' + str(stem) + '\n')
    print('ent: ' + str(ent) + '\n')
    print('alpha: ' + str(alpha) + '\n')
    print('Remove stopwords: ' + str(remove_stop_words) + '\n')
    print('Remove punctuation: ' + str(remove_punct) + '\n\n')

    print('Processing text...')

    processor = Preprocessor()
    train_text = processor.process_dataset(train_text,
                                           n_gram=n_gram,
                                           stem=stem,
                                           tags=tags,
                                           remove_stop_words=remove_stop_words,
                                           remove_punct=remove_punct,
                                           pos=pos,
                                           dep=dep,
                                           alpha=alpha,
                                           vectorizer='count',
                                           lex=lex)

    ##              TRAINING THE MODEL               ##

    print('Training model...')
    clf1 = LogisticRegression(solver='lbfgs',
                              multi_class='multinomial',
                              random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    # clf3 = SGDClassifier(loss='hinge', penalty='l2',
    #                       alpha=1e-3, random_state=42,
    #                       max_iter=7, tol=None)
    clf3 = MultinomialNB()
    clf4 = SVC(C=100, gamma=5e-05, kernel='rbf')
    text_clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                            ('mnb', clf3), ('svm', clf4)],
                                voting='hard')

    file = open(file_path, 'a')
    file.write('Features used: \n')
    file.write('NGRAM: ' + str(n_gram) + '\n')
    file.write('pos: ' + str(pos) + '\n')
    file.write('dep: ' + str(dep) + '\n')
    file.write('tags: ' + str(tags) + '\n')
    file.write('stem: ' + str(stem) + '\n')
    file.write('ent: ' + str(ent) + '\n')
    file.write('alpha: ' + str(alpha) + '\n')
    file.write('Remove stopwords: ' + str(remove_stop_words) + '\n')
    file.write('Remove punctuation: ' + str(remove_punct) + '\n\n')

    kf = KFold(n_splits=10)
    f1 = []
    precision = []
    recall = []
    for train_index, test_index in kf.split(train_text):
        # print('Kfold train_index: ', train_index, '\ntest_index: ', test_index)

        X_train, X_test = extract_indexes(train_text,
                                          train_index), extract_indexes(
                                              train_text, test_index)
        y_train, y_test = extract_indexes(train_target,
                                          train_index), extract_indexes(
                                              train_target, test_index)

        print(' train target ', extract_indexes(train_target, train_index))
        print(' test target ', extract_indexes(train_target, test_index))
        text_clf.fit(X_train, y_train)
        y_pred = text_clf.predict(X_test)
        print(confusion_matrix(y_test, y_pred))
        print(
            metrics.classification_report(y_test,
                                          y_pred,
                                          target_names=categories))

        file.write(
            metrics.classification_report(y_test,
                                          y_pred,
                                          target_names=categories))

        precision.append(metrics.precision_score(y_test, y_pred))
        recall.append(metrics.recall_score(y_test, y_pred))
        f1.append(metrics.f1_score(y_test, y_pred))

    f1 = np.array(f1)
    precision = np.array(precision)
    recall = np.array(recall)

    f1_mean = f1.mean()
    precision_mean = precision.mean()
    recall_mean = recall.mean()

    f1_std = f1.std()
    precision_std = precision.std()
    recall_std = recall.std()

    print('Writing log file\n')
    file.write('Recall Macro: ' + str(recall_mean) + ' (+/-) ' +
               str(recall_std * 2) + '\n')
    file.write('Precision Macro: ' + str(precision_mean) + ' (+/-) ' +
               str(precision_std * 2) + '\n')
    file.write('F1 Macro: ' + str(f1_mean) + ' (+/-) ' + str(f1_std * 2) +
               '\n')

    file.write('\n\n#############################################\n\n')
    file.close()
Example #50
def SVM():
    #s = os.listdir(pathAttributes.data)
    file_path_data = os.path.join(pathAttributes.data, "*.csv")
    list_of_file = glob.glob(file_path_data)
    latest_data = max(
        list_of_file,
        key=os.path.getctime)  # pick the most recently created CSV
    print(latest_data)
    file_path = os.path.join(pathAttributes.data, latest_data)
    df = pd.read_csv(file_path, header=None)
    df.replace('?', -99999, inplace=True)
    X = np.array(df[df.columns[1:129]])
    X.reshape(-1, 1)
    y = np.array(df[df.columns[0]])

    #print(X)
    #scaler = StandardScaler()
    #X = scaler.fit_transform(X.astype(np.float64))
    """
    to-do-list
    1.scaled inputs
    2.cross_validation
    3.grid search
    """
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.2, random_state=42)
    try:
        clt = joblib.load(pathAttributes.SVM_model)
        #backup the current correct model
        su.copy(pathAttributes.SVM_model, pathAttributes.backup)
    except:
        #clt = SGDClassifier(loss="hinge",penalty="l2", random_state=42, warm_start=True)
        rnd_clt = RandomForestClassifier(n_estimators=867, max_leaf_nodes=6)
        svc_clt = svm.SVC(kernel="linear", C=1.6, probability=True)
        knn_clt = KNeighborsClassifier()
        clt = VotingClassifier(estimators=[('rc', rnd_clt), ('kc', knn_clt),
                                           ('sc', svc_clt)],
                               voting='soft')
        #clt = svm.LinearSVC(penalty='l2', loss="hinge", C=1.6)

        #randomizedsearchCV for randomforestclassifier and knn
    """    
    param = {
            'n_estimators':randint(low=1,high=1000),
            'max_leaf_nodes':[6,None],
             }
    rnd_search = RandomizedSearchCV(clt, param_distributions=param, cv=3, scoring='accuracy')
    rnd_search.fit(X_train,y_train)
    print(rnd_search.best_params_,rnd_search.best_score_)
    print(rnd_search.cv_results_)
    """
    """
    param = [
            {'n_neighbors':[1,3,5,7,9]},
             ]
    rnd_search = GridSearchCV(clt, param, cv=3, scoring='accuracy')
    rnd_search.fit(X_train,y_train)
    print(rnd_search.best_params_,rnd_search.best_score_)
    print(rnd_search.cv_results_)
    """

    a = cross_val_predict(clt, X_train, y_train, cv=3)
    b = cross_val_score(clt, X_train, y_train, cv=3)
    print(confusion_matrix(y_train, a), b)
    total = len(df.index)
    print(total)
    chunk_size = 1000
    epochs = 1000
    """
    classes = []
    for i in clt.classes_:
        classes.append(i)
        
    for i in np.unique(y):
        flag = False
        for j in classes:
            if j == i:
                flag = True
        if not flag:
            classes.append(i)
    chunks = int(total/chunk_size)+1
    for epoch in range(epochs):
        for chunk in range(chunks):
            starter = chunk * chunk_size
            if starter+chunk_size > total:
                
                clt.partial_fit(X_train[starter:total+1],y_train[starter:total+1],classes=classes)
                #clt.fit(X_train[starter:total+1],y_train[starter:total+1])
            else:
                clt.partial_fit(X.train[starter:starter+chunk_size],y_train[starter:starter+chunk_size],classes=np.unique(y))
                #clt.fit(X_train[starter:total+1],y_train[starter:chunk_size])
"""

    clt.fit(X_train, y_train)
    indices = np.random.permutation(int(total / 3))
    X_validation = X_train[indices]  # unused; kept from earlier experimentation
    y_predict = clt.predict(X_test)
    print(confusion_matrix(y_test, y_predict))

    if clt.score(X_test, y_test) > 0.96:
        joblib.dump(clt, pathAttributes.SVM_model)
MNB_pipeline = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2))), 
                         ('clf', MultinomialNB(alpha = 1.0, fit_prior = True)),
                        ])

KNN_pipeline = Pipeline([('vect', CountVectorizer()), 
                         ('clf', KNeighborsClassifier(n_neighbors = 20)),
                        ])
                        
SGD_pipeline = Pipeline([('vect', CountVectorizer()),
                        ('clf', linear_model.SGDClassifier(loss='log')),
                        ])
                        
LR_pipeline = Pipeline([('vect', CountVectorizer()), 
                        ('tfidf', TfidfTransformer(norm = 'l2', use_idf = True, smooth_idf = True, sublinear_tf = True)),
                        ('clf', LogisticRegression(warm_start = True, random_state = 1)),
                       ]) 
                     

eclf = VotingClassifier(estimators=[('MNB', MNB_pipeline), ('SGD',SGD_pipeline), ('LR', LR_pipeline)], voting = 'soft', weights = [3,2,3])
#('KNN', KNN_pipeline), 

eclf.fit(rev_train,labels_train)

#use soft voting to predict (weighted average of class probabilities)
pred=eclf.predict(rev_test)

for x in pred:
    fileWriter.write(str(x)+'\n')
fileWriter.close()
Example #52
class Brain(object):
    """
    The Brain object
    Holds sklearn classifiers and makes it simpler to train using a dataframe
    """

    def __init__(self, lobes=False):
        """
        lobes = a dict of classifiers to use in the VotingClassifier
            defaults to RandomForestClassifier and DecisionTreeClassifier
        """
        if isString(lobes):
            try:
                self.load(lobes.split('.pickle')[0])
            except Exception as e:
                logger.exception(e)
                lobes = False
        if not lobes:
            lobes = {'rf': RandomForestClassifier(n_estimators=7,
                                                  random_state=666),
                     'dt': DecisionTreeClassifier()
                     }
        self.lobe = VotingClassifier(
            estimators=[(lobe, lobes[lobe]) for lobe in lobes],
            voting='hard',
            n_jobs=-1)
        self._trained = False

    def train(self, df, shuffle=True, preprocess=False, *args, **kwargs):
        """
        Takes a dataframe of features + a 'label' column and trains the lobe
        """
        if self._trained:
            logger.warning('Overwriting an already trained brain!')
            self._trained = False

        # shuffle data for good luck
        if shuffle:
            df = shuffleDataFrame(df)
        # scale train data and fit lobe
        x = df.drop('label', axis=1).values
        y = df['label'].values
        del df
        if preprocess:
            x = preprocessing.scale(x)
        logger.info('Training with %d samples', len(x))
        self.lobe.fit(x, y)
        self._trained = True

    def predict(self, df):
        """ Get a prediction from the votingLobe """
        return self.lobe.predict(prepDataframe(df).values)

    def score(self, df, test='predict'):
        """ Get a prediction score from the votingLobe """
        df = prepDataframe(df)
        return accuracy_score(df[test].values, df['label'].values)

    def save(self, location="brain"):
        """ Pickle the brain """
        if self._trained:
            joblib.dump(self.lobe, location + ".pickle")
            logger.info('Brain %s saved', location + '.pickle')
        else:
            return logger.error('Brain is not trained yet! Nothing to save...')

    def load(self, location="brain"):
        """ Loads a brain pickle """
        logger.info('Loading saved brain %s', location + '.pickle')
        self.lobe = joblib.load(location + ".pickle")
        self._trained = True
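A short usage sketch for Brain (features_df is a hypothetical dataframe of feature columns plus a 'label' column; the file name is illustrative):

brain = Brain()                     # defaults to RandomForest + DecisionTree lobes
brain.train(features_df)            # shuffles, then fits the VotingClassifier
preds = brain.predict(features_df)  # prepDataframe() is applied internally
brain.save('brain_v1')              # pickled to brain_v1.pickle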
rfClf = RandomForestClassifier(n_estimators=500, random_state=0)  # 500 trees.
svmClf = SVC(probability=True, random_state=0)  # probability calculation
logClf = LogisticRegression(random_state=0)
nbclf = GaussianNB()

# construct the ensemble classifier from the individual classifiers
clf2 = VotingClassifier(estimators=[('rf', rfClf), ('svm', svmClf),
                                    ('log', logClf), ('nb', nbclf)],
                        voting='soft')

# train the ensemble classifier
clf2.fit(X_train, y_train)

from sklearn.metrics import precision_score, accuracy_score
x_actual, x_pred = y_train, clf2.predict(X_train)
precision_score_VC_train = precision_score(x_actual, x_pred)
accuracy_score_VC_train = accuracy_score(x_actual, x_pred)
print('The precision score of Voting classifier on TRAIN is : ',
      round(precision_score_VC_train * 100, 2), '%')
print('The accuracy score of Voting classifier on TRAIN is : ',
      round(accuracy_score_VC_train * 100, 2), '%')

from sklearn.metrics import precision_score, accuracy_score
y_actual, y_pred = y_test, clf2.predict(X_test)
precision_score_VC_test = precision_score(y_actual, y_pred)
accuracy_score_VC_test = accuracy_score(y_actual, y_pred)
print('The precision score of Voting classifier on Test is : ',
      round(precision_score_VC_test * 100, 2), '%')
print('The accuracy score of Voting classifier on Test is : ',
      round(accuracy_score_VC_test * 100, 2), '%')
Ypred = np.zeros(Y.shape, dtype='object')

print('Classification using Ensemble')
for train_index, test_index in sss:
    print("Iter", itr, end=' ')
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    clf1 = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='manhattan')
    clf2 = RandomForestClassifier(n_estimators=300, max_depth=30, bootstrap=False, class_weight="balanced", min_samples_split=10)
    #clf3 = tree.DecisionTreeClassifier(max_depth=10, splitter='best', min_samples_split=81)
    
    clf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft')
    
    clf = clf.fit(X_train, y_train)
    Ypred[test_index] = clf.predict(X_test)    
    result = clf.predict(X_train)
    tr_acc = float(np.sum(y_train == result)) / float(y_train.shape[0])

    accuracy = float(np.sum(y_test == Ypred[test_index])) / float(y_test.shape[0])
    print(" => Train Accuracy = %.4f, Accuracy = %.4f" % (tr_acc, accuracy))
    itr += 1

accuracy = float(np.sum(Y == Ypred)) / float(Y.shape[0])
print("=== Total accuracy = ", accuracy, ' ===')
print('')
print(clf)
cm = confusion_matrix(Y, Ypred, labels=classes)
print(cm)
Example #55
svm = SVC(probability=True, C=3)
svm.fit(X_train, y_train)

knn = KNeighborsClassifier(metric='minkowski', n_neighbors=70, p=1, weights='distance')
knn.fit(X_train, y_train)

grad = GradientBoostingClassifier(learning_rate=0.1, max_depth=2, max_features=2, n_estimators=400, subsample=1.0)
grad.fit(X_train, y_train)

fi = pd.DataFrame({'feature': X.columns.values, 'importance': grad.feature_importances_})
plotting.p12(fi)

estimators = [('knn', knn), ('svm', svm), ('grad', grad)]
eclf = VotingClassifier(estimators=estimators, voting='soft')
eclf.fit(X_train, y_train)
pred_eclf = eclf.predict(X_test)
# params = [{'svm__C': range(1,50)
# },
# {'grad__n_estimators': [400, 500, 700, 1000],
# 'grad__max_depth': range(1,6),
# 'grad__subsample': [0.2,  0.6, 1.0],
# 'grad__learning_rate': [0.1, 0.5, 1.0],
# 'grad__max_features': [1, 2, 4, 'auto', 'log2', None]
# },
# {'knn__n_neighbors': [3, 5, 10, 20, 40],
# 'knn__p': range(1,7)
# }]
# params = {'n_estimators': [400, 500, 700, 1000],
# 'max_depth': range(1,6),
# 'subsample': [0.2,  0.6, 1.0],
# 'learning_rate': [0.1, 0.5, 1.0],
Example #56
print("xgb cross validation f1-score = ",
      cross_val_score(xgb, X_train, y_train, cv=5, scoring="f1_micro").mean()
      )  #xgboost with full train set (all features)
print("mlp cross validation f1-score = ",
      cross_val_score(mlp, X_train, y_train, cv=5, scoring="f1_micro").mean())

#initialize ensembles
estimators = []
estimators.append(('mlp', mlp))
estimators.append(('rf', rf))
estimators.append(('xgb', xgb))

#voting ensemlbe
ensemble = VotingClassifier(estimators, voting='soft', weights=[1, 1, 1])
ensemble.fit(X_train, y_train)
pred = ensemble.predict(X_test)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred, average='micro')))

#meta classifier ensemble
stack = StackingCVClassifier(classifiers=[mlp, xgb, rf],
                             cv=2,
                             meta_classifier=lr,
                             use_probas=True)
stack.fit(X_train.values, y_train.values)
pred2 = stack.predict(X_test.values)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred2, average='micro')))

from sklearn.metrics import confusion_matrix

confusion_lr = confusion_matrix(y_test, pred)
print(confusion_lr)
#for each outcome file, get training and test data to build a model and predict outcomes
for o in outcome_list:
    info,outcome=loadData('Outcomes' + '/' + o +'.txt')  
    
    #split data into training and test datasets
    train, test, labels_train, labels_test = train_test_split(info, outcome, test_size=0.33)

    counter = CountVectorizer()
    counter.fit(train)

    #count the number of times each term appears in a document and transform each doc into a count vector
    counts_train = counter.transform(train)#transform the training data
    counts_test = counter.transform(test)#transform the testing data
    
    #build a classifier on the training data using LR and NB
    clf1 = LogisticRegression()
    clf2 = MultinomialNB()

    #build a voting classifier - give logistic regression twice as much weight
    eclf = VotingClassifier(estimators=[('lr', clf1), ('mnb', clf2)], voting='soft', weights=[2, 1])

    #train the voting classifier on the training data
    eclf.fit(counts_train, labels_train)

    #use soft voting to predict (weighted average of class probabilities)
    predicted = eclf.predict(counts_test)
 
    #print the accuracy
    print('Accuracy of', o, 'prediction: ', accuracy_score(predicted, labels_test))
bn_pred = bnb.fit(Imdb_train_vectors.toarray(),
                  Imdb_train_labels).predict(Imdb_test_vectors.toarray())

print "Naive Bayes F1 Score on IMDB dataset: ", f1_score(Imdb_test_labels,
                                                         bn_pred,
                                                         average='macro')
print "Naive Bayes F1 Score on UCI dataset: ", UCI_scores.mean()
print "\n"
#print "Classification Report for Naive Bayes on IMDB dataset:\n", classification_report(Imdb_test_labels, y_pred)

###################### Voting Classifier ######################
voting_clf = VotingClassifier(estimators=[('nb', bnb), ('lg1', logistic_reg),
                                          ('svc', classifier_liblinear),
                                          ('mlp', ML_perceptron)],
                              voting='hard',
                              weights=[1, 1, 1, 1])
# voting_clf = VotingClassifier(estimators=[('nb', bnb), ('lg1', logistic_reg), ('svc', classifier_liblinear)],
#                       voting='hard', weights=[1,1,1])
UCI_scores = cross_val_score(voting_clf,
                             UCI_train_vectors,
                             UCI_train_labels,
                             cv=10,
                             scoring='f1_macro')
voting_clf.fit(Imdb_train_vectors, Imdb_train_labels)
voting_clf_pred = voting_clf.predict(Imdb_test_vectors)

print "Voting Classifier F1 Score on IMDB dataset: ", f1_score(
    Imdb_test_labels, voting_clf_pred, average='macro')
print "Voting Classifier F1 Score on UCI dataset: ", UCI_scores.mean()
#print "Classification Report for Voting Classifier on IMDB dataset:\n", classification_report(Imdb_test_labels, voting_clf)
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    #NOTE we might not need xtltrain
    # xtrain and ytrain are your training set. xtltrain holds the indices of the corresponding recordings in xtrain and ytrain. these will always be present
    # xtest is your testing set. xtltest holds the corresponding recording indices. for the practice set xtltest = xtrunclength
    # ytarget is optional and depends on whether you are using a testing set or the practice set

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest,xtltest,ytarget = removeNanAndInf(xtest,xtltest,ytarget)
    # print 'finished removal of Nans'

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)


    #if xtest is an NxM matrix, returns an N x numfiers matrix where each column corresponds to a classifier's prediction vector
    count = 0
    # print numfiers

    predictionMat = np.empty((xtest.shape[0],numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain,ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count+=1
    if count < numfiers:
        # VotingClassifier combines several different machine learning classifiers and uses a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()



        eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain,ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     cla
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:

        bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
        bagging2.fit(xtrain,ytrain)
        #print bagging2.score(xtest,ytest)
        ytest = bagging2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1


    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain,ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with a quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain,ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        # Linear discriminant analysis - classifier with a linear decision boundary
        lda = linda()
        lda.fit(xtrain,ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    # print xtltest
    # print len(ytest)
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol,xtltest,4,grids,isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol,xtltest,4,isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget,xtltest,1,isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol,grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += map(int,modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)
        if not testing:
            if ytarget is not None:
                #print targets1
                #print ""
                #print predictions1
                confusionme = confusion_matrix(targets1[0],predictions1[0])
                #print "Confusion Matrix is: "
                #print confusionme


    return predictionStringMat, targetStringMat, finalPredMat
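#Hedged refactor sketch: the chain of `if count < numfiers:` blocks above can
#be collapsed into a single loop over an ordered classifier list. This mirrors
#the models and their order in myclassify_practice_set and assumes the same
#aliases (RFC, ETC, DTC, GBC, ABC, quadda, linda, neighbors, ...) are imported.
def fill_prediction_matrix(numfiers, xtrain, ytrain, xtest):
    classifiers = [
        VotingClassifier(estimators=[('svc', SVC()), ('rfc', RFC(bootstrap=False)),
                                     ('etc', ETC()),
                                     ('knn', neighbors.KNeighborsClassifier()),
                                     ('qda', quadda())]),
        BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        ETC(),
        BaggingClassifier(ETC()),
        SVC(),
        quadda(),
        DTC(),
        neighbors.KNeighborsClassifier(),
        linda(),
        RFC(),
        BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        RFC(bootstrap=False),
        GBC(),
        neighbors.KNeighborsClassifier(n_neighbors=10),
        neighbors.KNeighborsClassifier(n_neighbors=3),
        neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        NearestCentroid(),
        ABC(),
    ]
    predictionMat = np.empty((xtest.shape[0], numfiers))
    for count, clf in enumerate(classifiers[:numfiers]):
        clf.fit(xtrain, np.ravel(ytrain))
        predictionMat[:, count] = clf.predict(xtest)  #one column per classifier
    return predictionMat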
#save best model
etc_best = etc_gs.best_estimator_

#check best n_estimators value
print('etc: ',etc_gs.best_params_, etc_gs.best_score_)



#Voting classifier
from sklearn.ensemble import VotingClassifier

#create a dictionary of our models
estimators=[('rf', rf_best),('knc',knc_best), ('log_reg', log_reg), ('etc',etc_best),
            ('gbc', gbc_best), ('SVC',svc_best),('ADC',adc_best),('xgb',xgb_best)]
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='soft')

#fit model to training data
ensemble.fit(X_train, y_train)

#test our model on the test data
print(ensemble.score(X_test, y_test))



submission = pd.DataFrame()
prediction = pd.DataFrame(ensemble.predict(test_cleaned.loc[:,'CabinA':].values))

submission['PassengerId'] = test['PassengerId']  #Kaggle's Titanic submission format expects PassengerId first, then Survived
submission['Survived'] = prediction[0]
submission.to_csv('submission.csv',index = False)
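#Hedged follow-up sketch: the soft-voting weights need not stay uniform; they
#can be grid-searched like any other hyperparameter. The candidate weight
#lists below are illustrative only (one entry per estimator, eight in total).
from sklearn.model_selection import GridSearchCV

weight_grid = {'weights': [None,
                           [2, 1, 1, 2, 1, 1, 1, 2],
                           [1, 1, 1, 1, 2, 2, 2, 2]]}
vote_gs = GridSearchCV(VotingClassifier(estimators, voting='soft'),
                       weight_grid, cv=5, scoring='accuracy')
vote_gs.fit(X_train, y_train)
print('best voting weights: ', vote_gs.best_params_, vote_gs.best_score_)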