Example #1
class Ensemble:

	def __init__(self, data):
		self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1, min_samples_split=45, criterion='entropy')
		self.lda = LDA()
		self.dec = DecisionTreeClassifier(criterion='entropy')
		self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)

		self.make_prediction(data)


	def make_prediction(self, data):
		'''
		Make an ensemble prediction
		'''
		self.rf.fit(data.features_train, data.labels_train)
		self.lda.fit(data.features_train, data.labels_train)
		self.dec.fit(data.features_train, data.labels_train)
		self.ada.fit(data.features_train, data.labels_train)

		pre_pred = []
		self.pred = []

		ada_pred = self.ada.predict(data.features_test)
		rf_pred = self.rf.predict(data.features_test)
		lda_pred = self.lda.predict(data.features_test)
		dec_pred = self.dec.predict(data.features_test)

		for i in range(len(rf_pred)):
			pre_pred.append([ rf_pred[i], lda_pred[i], dec_pred[i], ada_pred[i] ])

		for entry in pre_pred:
			pred_list = sorted(entry, key=entry.count, reverse=True)
			self.pred.append(pred_list[0])
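The voting loop above keeps, for each test sample, the label predicted most often by the four models. A minimal, hypothetical sketch of the same majority vote written with collections.Counter (not part of the original class; the helper name is illustrative):

from collections import Counter

def majority_vote(rows):
    # rows: one list of per-model predictions per test sample
    return [Counter(row).most_common(1)[0][0] for row in rows]

# e.g. majority_vote([[1, 0, 1, 1], [0, 0, 1, 0]]) returns [1, 0]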
Example #2
def plot_adaboost():
    X, y = make_moons(noise=0.3, random_state=0)

    # Create and fit an AdaBoosted decision tree
    est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=200)

    sample_weight = np.empty(X.shape[0], dtype=float)
    sample_weight[:] = 1. / X.shape[0]

    est._validate_estimator()
    est.estimators_ = []
    est.estimator_weights_ = np.zeros(4, dtype=float)
    est.estimator_errors_ = np.ones(4, dtype=float)

    plot_step = 0.02

    # Plot the decision boundaries
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)
    # ColorBrewer palette (hex: '#d7191c', '#fdae61', '#ffffbf', '#abd9e9', '#2c7bb6')
    c = lambda r, g, b: [x / 255.0 for x in (r, g, b)]  # normalize 8-bit RGB to [0, 1]
    colors = [c(215, 25, 28),
              c(253, 174, 97),
              c(255, 255, 191),
              c(171, 217, 233),
              c(44, 123, 182),
              ]

    for i, ax in enumerate(axes):
        sample_weight, estimator_weight, estimator_error = est._boost(i, X, y, sample_weight)
        est.estimator_weights_[i] = estimator_weight
        est.estimator_errors_[i] = estimator_error
        sample_weight /= np.sum(sample_weight)

        Z = est.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z,
                    cmap=matplotlib.colors.ListedColormap([colors[1], colors[-2]]),
                    alpha=1.0)
        ax.axis("tight")

        # Plot the training points
        ax.scatter(X[:, 0], X[:, 1],
                   c=np.array([colors[0], colors[-1]])[y],
                   s=20 + (200 * sample_weight) ** 2, cmap=plt.cm.Paired)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel('$x_0$')

        if i == 0:
            ax.set_ylabel('$x_1$')

    plt.tight_layout()
    plt.show()
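This example steps through the boosting rounds by calling the private _boost method so the per-sample weights are available for plotting at each stage. If only the staged predictions are needed, the public staged_predict API gives the same progression without relying on sklearn internals; a minimal sketch under that assumption (variable names are illustrative):

from sklearn.datasets import make_moons
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_moons(noise=0.3, random_state=0)
est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=4)
est.fit(X, y)

# staged_predict yields the ensemble prediction after each boosting round
for i, y_stage in enumerate(est.staged_predict(X), start=1):
    print("round %d: training accuracy %.3f" % (i, (y_stage == y).mean()))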
Example #3
def cvalidate():
    from sklearn import cross_validation

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:]
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    #print X,y
    import math
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                X[i][j] = 26.6
   
    #print X[0:3]
    #print y[0:3]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)

    X_train, X_test = decomposition_pca(X_train, X_test)
    
    bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200)
    bdt.fit(X_train, y_train)
    
    

    print bdt.score(X_test, y_test)
Example #4
def Adaboost(TrainData,TestData):
    features=['Time','Season','Hour','Minute','District']

    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30)

    size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0,len(size)):
        train,validation= train_test_split(TrainData, train_size=size[i])

        while len(set(train['Category'])) != len(set(validation['Category'])):
            train,validation= train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer()
        print "Runnin  time adaboost is ", stop-start"""
        predicted=np.array(clf.predict_proba(validation[features]))
        model=clf.predict(train[features])
        model1=clf.predict(validation[features])

        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model)
        print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1)
        print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None)


        #writing to file
        """Category_new=[]
Example #5
def runAdaBoost(arr):  # arr encodes depth, n_est and lrn_rate; the filename parameter was removed for the scipy optimise routine
    #ada = AdaBoostClassifier(n_estimators=100)
    global file_dir, nEvents, solutionFile, counter
    print 'iteration number ' + str(counter)
    counter+=1
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        return 100

    fname = 'ada_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate)
    filename = fname
    ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth),
                             algorithm="SAMME",
                             n_estimators=n_est)#,n_jobs=4)
    print "AdaBoost training"
    ada.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoost testing"
    ada_pred = ada.predict(sigtest[train_input].values)
    solnFile(filename,ada_pred,sigtest['EventId'].values)#
    print "AdaBoost finished"
    # added for the scipy optimise thing
    ams_score = ams.AMS_metric(solutionFile, file_dir+fname+'.out', nEvents)
    print ams_score
    logfile.write(fname + ': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score) # since we are minimising
Example #6
def main():

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:]
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    #print X,y
    import math
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                X[i][j] = 26.6
   
    
    testset = np.genfromtxt(open('test.csv','r'), delimiter = ',')[1:]

    test = np.array([x[1:8] for x in testset])
    for i, x in enumerate(test):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                test[i][j] = 26.6
   

    X, test = decomposition_pca(X, test)

    bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200)
    bdt.fit(X, y)
    


    print 'PassengerId,Survived'
    for i, t in enumerate(test):
        # reshape the single PCA-transformed sample to 2-D before predicting
        print '%d,%d' % (i + 892, int(bdt.predict(t.reshape(1, -1))[0]))
def ada_prediction(features_train, labels_train, features_test, ids):

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(features_train, labels_train, random_state=1301, stratify=labels_train, test_size=0.3)

    clf = AdaBoostClassifier(RandomForestClassifier(bootstrap=True,
                                                    criterion='entropy', max_depth=None, max_features=2,
                                                    max_leaf_nodes=16, min_samples_split=10, n_estimators=1000,
                                                    n_jobs=-1, oob_score=False),
                              algorithm="SAMME",
                              n_estimators=200)


    #clf_acc = clf.fit(X_train, y_train)
    # print(clf.best_estimator_)
    #feature_importance = clf.feature_importances_
    #print (feature_importance)

    #pred = clf_acc.predict_proba(X_test)[:,1]
    #print (y_test, pred)
    # acc = accuracy_score(y_test, pred)
    # print ("Acc {}".format(acc))

    clf = clf.fit(features_train, labels_train)

    pred = clf.predict_proba(features_test)[:,1]

    predictions_file = open("data/canivel_ada_forest.csv", "wb")
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["ID", "TARGET"])
    predictions_file_object.writerows(zip(ids, pred))
    predictions_file.close()
def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples):
    uniqLabels = np.unique(labels)
    print 'Taking ', str(n_lab), ' labels'
    uniqLabels = uniqLabels[:n_lab]
    used_labels = uniqLabels
    pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' %len(uniqLabels))
    allLearners = []
    for yy ,targetLab in enumerate(uniqLabels):
        runs=[]
        for rrr in xrange(n_runs):
            #import ipdb;ipdb.set_trace()
            feats,labs = get_binary_sets(features, labels, targetLab, n_samples)
            #print 'fitting stump'
            #import ipdb;ipdb.set_trace()
            baseClf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, min_samples_split=10)
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier( base_estimator=baseClf, learning_rate=learning_rate,
                                      n_estimators=n_estim,
                                      algorithm="SAMME.R")
            #import ipdb;ipdb.set_trace()
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
        update_progressbar(pbar, yy)
    end_progressbar(pbar)
    
    return allLearners, used_labels
Example #9
def adaBoost(n,x,t,x_test,t_test):
    clf = AdaBoostClassifier(n_estimators = n)
    clf.fit(x, t)
    predictions = clf.predict(x_test)
    X = confusion_matrix(t_test,predictions)
    # use float division so the rate is not truncated to zero under Python 2
    classificationRate = float(X[1, 1] + X[0, 0]) / X.sum()
    return 1 - classificationRate
def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = AdaBoostClassifier().fit(X, y_t)
    assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)
    
    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)
    
    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)
    
    ############ Extra Tree: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)
    
    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)
    
    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)
    
    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)
    
    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC    
def createAdaBoostClassifier(trainingVectors, targetValues):
    

    clf = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
    clf.fit(trainingVectors, targetValues, sample_weight=targetValues*10000)
    
    return(clf)
Example #13
class DomainTypeClassifier(object):
    def __init__(self, radius, window_mode=False):
        self.classifier = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2),
            n_estimators=20,
            learning_rate=1,
            algorithm="SAMME")
        # svm.SVC(kernel='rbf')
        self.radius = radius
        self.window_mode = window_mode

    def train(self, dataset):
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        rin, rout = dataset.getData(k, self.window_mode)
        print("fitting", len(rin))
        self.classifier.fit(np.asarray(rin, float), np.asarray(rout, float))

    def predict(self, ns):
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        to_predict = []
        for i in range(len(ns)):
            if not self.window_mode:
                to_predict.append(encode(create_region(ns, i, k)))
            else:
                if i > len(ns) - k:
                    break
                to_predict.append(encode(ns[i:i+k]))
        return int(Counter(self.classifier.predict(
            np.asarray(to_predict, float))).most_common(1)[0][0])
Example #14
def cvalidate():
    targetset = np.genfromtxt(open('trainLabels.csv','r'), dtype='f16')
    y = [x for x in targetset]

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',', dtype='f16')
    X = np.array([x for x in trainset])
    
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)

    X_train, X_test = decomposition_pca(X_train, X_test)

    #SVM

    c_range = 10.0 ** np.arange(6.5,7.5,.25)
    gamma_range = 10.0 ** np.arange(-2.5,0.5,.25)
    parameters = {'kernel':['rbf'], 'C':c_range,  'gamma':gamma_range} 
    svr = SVC()

    clf = grid_search.GridSearchCV(svr, parameters)
    

    clf.fit(X_train, y_train)
    bdt = AdaBoostClassifier(base_estimator = clf.best_estimator_,
                         algorithm="SAMME",
                         n_estimators=100)

    
    #bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=10))
    bdt.fit(X_train, y_train)
    

    print bdt.score(X_test, y_test)
Example #15
def ANGEL_training(cds_filename, utr_filename, output_pickle, num_workers=3):
    coding = [ r for r in SeqIO.parse(open(cds_filename), 'fasta') ]
    utr = [ r for r in SeqIO.parse(open(utr_filename), 'fasta') ]

    o_all = c_ORFscores.CDSWindowFeat()
    add_to_background(o_all, coding)
    add_to_background(o_all, utr)

    data_pos = get_data_parallel(o_all, coding, [0], num_workers)
    data_neg = get_data_parallel(o_all, utr, [0, 1, 2], num_workers)

    data = data_neg + data_pos
    target = [0]*len(data_neg) + [1]*len(data_pos)
    data = np.array(data)

    print >> sys.stderr, "data prep done, running classifier...."
    bdt = AdaBoostClassifier(n_estimators=50)
    bdt.fit(data, target)

    print >> sys.stderr, "classifier trained. putting pickle to", output_pickle

    with open(output_pickle, 'wb') as f:
        dump({'bdt':bdt, 'o_all':o_all}, f)

    return data, target, bdt
Example #16
def some(X, Y, X_test, Y_test):
    ada = AdaBoostClassifier()
    print "Train Model ---"
    t1 = time()
    ada.fit(X, Y)
    t2 = time()
    print "Model Trained ----------", t2 - t1
    test_errors = []
    cur = 1
    Y_test2 = []
    for k in Y_test:
        Y_test2.append(k[0])
    print "Testing: "
    print  Y_test2
    pred =  ada.predict(X_test)
    print pred
    accu =  1. - accuracy_score(y_true= Y_test2, y_pred= pred)
    print accu
    print "STAGED _____________"
    for test_predict in ada.staged_predict(X_test):
        test_errors.append(1. - accuracy_score(test_predict, Y_test2))


    print  "errorss : "
    print test_errors
Example #17
def ada(xtrain, ytrain, train_weight, tests, test_weight):
    #Initiate the training model
    clf = AdaBoostClassifier()
    mistakes = 0
    cost = 0
    #Fit the model
    clf.fit(xtrain, ytrain)
    vector_count = 0
    #Iterate over the tests
    for i in range(len(tests)):
        #Get the number of elements in each test
        vector_count += len(tests[i])
        test_count = 0
        #Iterate over each feature in the tests
        for vector in tests[i]:
            #Predict based on each feature
            prediction = clf.predict([vector])  # wrap the sample so predict receives a 2-D input
            #Determine the cost
            cost += test_weight[i][test_count] * pen[i][prediction[0]]
            #Count the number of mistakes
            if pen[i][prediction[0]] > 0:
                #print("Incorrectly Predicted " + str(Segments.reverse_mapping[i]) + " as " + str(Segments.reverse_mapping[prediction[0]]))
                mistakes += 1
            # advance to the next test vector's weight regardless of the outcome
            test_count += 1

    print("Number of mistakes: " + str(mistakes) + " of " + \
            str(vector_count) + ", " + \
            str((1.-float(mistakes)/float(vector_count))*100) + \
            "% accurate")

    return cost
Example #18
def ada_boost_dt():
    """
    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    ab = AdaBoostClassifier(n_estimators=300)

    scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('E_val: %f', sum(scores) / len(scores))

    ab.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('ab', ab)]), 'ada_boost_dt_0707_03')
Example #19
class AdaBoost:
    def __init__(self, data, n_estimators=50, learning_rate=1.0):
        features, weights, labels = data
        self.clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
        self.dataset = split_dataset(features, weights, labels)

    def train(self):
        """
        Train Ada Boost on the higgs dataset
        """
        self.clf = self.clf.fit(self.dataset['training']['features'], self.dataset['training']['labels'])

    def predict(self):
        """
        Predict label using Ada Boost
        :return:
        """
        self.predictions = self.clf.predict(self.dataset['test']['features'])

    def evaluate(self):
        self.trnaccuracy = self.clf.score(self.dataset['training']['features'],
                                          self.dataset['training']['labels'],
                                          sample_weight=self.dataset['training']['weights'])
        self.tstaccuracy = self.clf.score(self.dataset['test']['features'],
                                          self.dataset['test']['labels'],
                                          sample_weight=self.dataset['test']['weights'])
Example #20
def ab_predictedValue():
    print '----------AdaBoost----------'
    ab_clf = AdaBoostClassifier(n_estimators = NoOfEstimators)
    ab_clf.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    ab_predictedValue = ab_clf.predict_proba(test_df[features])
    print 'Feature Importance = %s' % ab_clf.feature_importances_
    return ab_predictedValue[:,1]
Example #21
File: adaboost.py Project: kbai/uss
def main():
    print("gradient boosting  classifier!")

    X,Y,Xtest = importdata()
    print(Y.shape)
    param_grid={
            "n_estimators":[10,100,200,2000,20000],
            "base_estimator__n_estimators":[10,20,50,100,200],
            "base_estimator__min_samples_split":[5,10,20,50]
            }

    ab=AdaBoostClassifier(RandomForestClassifier())
    Gridsearch_impl(X,Y,ab,param_grid,5)

#    for i in range(10,11,5):
#        clf = DecisionTreeClassifier(min_samples_split=i)
#        rf = RandomForestClassifier(n_estimators = 100,random_state=0,min_samples_split=i)
#        ab = AdaBoostClassifier(rf,n_estimators = 10)
        #ab = GradientBoostingClassifier(n_estimators = 100)
#        score = cross_validation.cross_val_score(ab,X,Y,cv=3)
      #  print(score)
      #  print("average score %f"%np.mean(score))
      #  print("std %f"%np.std(score))
      #  ab.fit(X,Y)
   


    Ytest = ab.predict(Xtest)
    output(Ytest,'submit3.csv')
def adaboost(df,label_name,feature_names,features_len,ifeat,n_estimators=100):
    # TODO: just copied from RF, needs real code
    from sklearn.ensemble import AdaBoostClassifier
    print('---------------------------------------------------')
    print(ifeat,features_len,'Adaboost, features:',feature_names)
    df_train_Y = df[label_name]
    train_Y = df_train_Y.values.ravel()  # turn from 2D to 1D

    df_train_X = df[feature_names]
    train_X = df_train_X.values

    clf =AdaBoostClassifier(n_estimators=n_estimators)
    clf = clf.fit(train_X,train_Y)
    # output = clf.predict(train_X)
    E_in = round(1.-clf.score(train_X, train_Y),5) # 'in sample' error
    #print('\tE_in :',E_in)

    # -----
    # Kfold as estimator for 'out of sample' error
    kf=skl.cross_validation.KFold(n=len(train_X), n_folds=5)
    cv_scores=skl.cross_validation.cross_val_score(clf, train_X, y=train_Y, cv=kf)
    E_out = round(1.-np.mean(cv_scores),5)
    #print("\tE_out:",E_out)

    return E_in,E_out
Example #23
def trainClassifier(dataDir, trialName, NUMFISH):


    
    ch = circularHOGExtractor(6,4,3) 
    nFeats = ch.getNumFields()+1
    trainData = np.array([])#np.zeros((len(lst0)+len(lst0c)+len(lst1),nFeats))
    targetData = np.array([])#np.hstack((np.zeros(len(lst0)+len(lst0c)),np.ones(len(lst1))))
    for tr in range(NUMFISH):
        directory = dataDir + '/process/' + trialName + '/FR_ID' + str(tr) + '/'
        files = [name for name in os.listdir(directory)]
        thisData = np.zeros((len(files),nFeats))
        thisTarget = tr*np.ones(len(files))
        i = 0
        for imName in files:
            sample = cv2.imread(directory + imName)
            thisIm = cv2.cvtColor(sample, cv2.COLOR_BGR2GRAY)
            
            thisData[i,:] = np.hstack((ch.extract(thisIm), np.mean(thisIm)))
            i = i + 1
        trainData = np.vstack((trainData, thisData)) if trainData.size else thisData
        targetData = np.hstack((targetData, thisTarget)) if targetData.size else thisTarget

    #clf = svm.SVC()
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME",n_estimators=50)
    clf.fit(trainData,targetData)
    pickle.dump(clf, open( dataDir + '/process/' + trialName + '/boost' + trialName + '.p',"wb"))
    y_pred = clf.predict(trainData)
    print("Number of mislabeled points out of a total %d points : %d" % (trainData.shape[0],(targetData != y_pred).sum()))
    def __init__(self,n_estimators=50, learning_rate=1.0, algorithm='SAMME.R',\
        criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1,\
        max_features=None, random_state=None, min_density=None, compute_importances=None):

        base_estimator=DecisionTreeClassifier()
        self.base_estimator = base_estimator
        self.base_estimator_class = self.base_estimator.__class__
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.algorithm = algorithm
        self.splitter = splitter
        self.max_depth = max_depth
        self.criterion = criterion
        self.max_features = max_features
        self.min_density = min_density
        self.random_state = random_state
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.compute_importances = compute_importances
        
        self.estimator = self.base_estimator_class(criterion=self.criterion, splitter=self.splitter, max_depth=self.max_depth,\
                min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, max_features=self.max_features,\
                random_state=self.random_state, min_density=self.min_density, compute_importances=self.compute_importances)
        
        AdaBoostClassifier.__init__(self, base_estimator=self.estimator, n_estimators=self.n_estimators, learning_rate=self.learning_rate, algorithm=self.algorithm)
def do_all_study(X,y):
    
    names = [ "Decision Tree","Gradient Boosting",
             "Random Forest", "AdaBoost", "Naive Bayes"]

    classifiers = [
        #SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier()]
    for name, clf in zip(names, classifiers):
        estimator,score = plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')


    clf_GBC = GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1)
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]

    plot_validation_curve(clf_GBC, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_GBC.fit(X_train,y_train)
    y_pred_GBC = clf_GBC.predict_proba(X_test)[:,1]
    print("ROC AUC GradientBoostingClassifier: %0.4f" % roc_auc_score(y_test, y_pred_GBC))

    clf_AB = AdaBoostClassifier()
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]

    plot_validation_curve(clf_AB, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_AB.fit(X_train,y_train)
    y_pred_AB = clf_AB.predict_proba(X_test)[:,1]
    print("ROC AUC AdaBoost: %0.4f" % roc_auc_score(y_test, y_pred_AB))
Example #26
def train_adaboost(features, labels):
    uniqLabels = np.unique(labels)
    print 'TAKING ONLY ', str(N_LAB), ' LABELS FOR SPEED '
    uniqLabels = uniqLabels[:N_LAB]
    
    allLearners = []
    for targetLab in uniqLabels:
        print 'processing for label ', str(targetLab)
        runs=[]
        #import ipdb;ipdb.set_trace()
        for rrr in xrange(N_RUNS):
            #import ipdb;ipdb.set_trace()
            feats,labs = get_binary_sets(features, labels, targetLab)
            #print 'fitting stump'
            #import ipdb;ipdb.set_trace()
            baseClf = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier( base_estimator=baseClf, learning_rate=learning_rate,
                                      n_estimators=N_ESTIM,
                                      algorithm="SAMME.R")
            #import ipdb;ipdb.set_trace()
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
    
    return allLearners
def prediction(feat,label):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(feat, label, test_size = 0.25, random_state = 0)
    num_leaves = []
    accuracy_score = []
    auc_score = []
    # for depth in range(1,10):
    #     clf = tree.DecisionTreeClassifier(max_depth = depth)
    #     clf.fit(x_train,y_train)
    #     predictions = clf.predict(x_test)
    #     accuracy = clf.score(x_test,y_test)
    #     auc = metrics.roc_auc_score(y_test,predictions)
    #     num_leaves.append(depth)
    #     accuracy_score.append(accuracy)
    #     auc_score.append(auc)

    for depth in range(1,10):
        clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = depth), n_estimators = 100)
        clf.fit(x_train,y_train)
        predictions = clf.predict(x_test)
        accuracy = clf.score(x_test,y_test)
        auc = metrics.roc_auc_score(y_test,predictions)
        num_leaves.append(depth)
        accuracy_score.append(accuracy)
        auc_score.append(auc)


    return num_leaves,accuracy_score,auc_score
Example #28
def runAdaReal(arr):  # arr encodes depth, n_est and lrn_rate; original signature was (depth, n_est, filename, lrn_rate=1.0)
    global file_dir, nEvents, solutionFile, counter
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    print 'iteration number ' + str(counter)
    counter+=1
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        print 'return 100'
        return 100
    filename =  'adar_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate) # low
    bdt_real = AdaBoostClassifier(
        tree.DecisionTreeClassifier(max_depth=depth),
        n_estimators=n_est,
        learning_rate=lrn_rate)
    print "AdaBoostReal training"
    bdt_real.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoostReal testing"
    bdt_real_pred = bdt_real.predict(sigtest[train_input].values)
    solnFile(filename,bdt_real_pred,sigtest['EventId'].values)#
    print "AdaBoostReal finished"
    ams_score = ams.AMS_metric(solutionFile, file_dir+filename+'.out', nEvents)
    print ams_score
    logfile.write(filename+': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score)
Example #29
def classify(x, y, cv, n_estimator=50):
    acc, prec, recall = [], [], []
    base_clf = DecisionTreeClassifier(
        compute_importances=None,
        criterion="entropy",
        max_depth=1,
        max_features=None,
        max_leaf_nodes=None,
        min_density=None,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=None,
        splitter="best",
    )

    global clf
    clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=n_estimator)
    for train, test in cv:
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        clf = clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
    a = np.mean(acc)
    p = np.mean(prec)
    r = np.mean(recall)
    f = 2 * p * r / (p + r)
    return a, p, r, f
def AB_results(): # AdaBoostClassifier
	print "--------------AdaBoostClassifier-----------------"
	rang = [60, 80]
	
	# print "--------------With HOG-----------------"
	# ans = []
	# print "n_estimators	Accuracy"
	# for i in rang:
	# 	clf = AdaBoostClassifier(n_estimators=i)
	# 	clf.fit(X_train_hog, y_train)
	# 	mean_accuracy = clf.score(X_test_hog, y_test)
	# 	print i, "	", mean_accuracy
	# 	ans.append('('+str(i)+", "+str(mean_accuracy)+')')
	# print ans

	# plt.plot(rang, ans, linewidth=2.0)
	# plt.xlabel("n_estimators")
	# plt.ylabel("mean_accuracy")
	# plt.savefig("temp_hog.png")

	
	print "\n--------------Without HOG-----------------"
	ans = []
	accs = []
	print "n_estimators	Accuracy"
	for i in rang:
		clf = AdaBoostClassifier(n_estimators=i)
		clf.fit(X_train, y_train)
		mean_accuracy = clf.score(X_test, y_test)
		print i, "	", mean_accuracy
		ans.append('('+str(i)+", "+str(mean_accuracy)+')')
		accs.append(mean_accuracy)
	print ans
	# plot the accuracy values; ans holds formatted strings for printing only
	plt.plot(rang, accs, linewidth=2.0)
	plt.xlabel("n_estimators")
	plt.ylabel("mean_accuracy")
	plt.savefig("temp_plain.png")
from sklearn.ensemble import AdaBoostClassifier

x_train, x_test, y_train, y_test = train_test_split(x, y_class, test_size=0.2)

decision_tree = AdaBoostClassifier(DecisionTreeClassifier(criterion="entropy",
                                                          random_state=0,
                                                          max_depth=3),
                                   n_estimators=20)
decision_tree = decision_tree.fit(x_train, y_train)
train_accuracy = decision_tree.score(x_train, y_train)
test_accuracy = decision_tree.score(x_test, y_test)
Example #32
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
classifier.fit(train, yTrain)
y_pred = classifier.predict(test)
result = gen_result(test_id, y_pred)
result.to_csv('./data/submission3.csv', index=False)
print('random forest finished!')

'''
    Ada boost
'''
ada_params = {
    'n_estimators': 200,
    'learning_rate' : 0.75
}

clf = AdaBoostClassifier(**ada_params)
clf.fit(train, yTrain)
y_pred = clf.predict(test)

result = gen_result(test_id, y_pred)
result.to_csv('./data/submission4.csv', index=False)
print('adaboost finished!')

# Vote for the result

res1 = pd.read_csv('./data/submission1.csv')
res2 = pd.read_csv('./data/submission2.csv')
res3 = pd.read_csv('./data/submission3.csv')
res4 = pd.read_csv('./data/submission4.csv')

label1 = np.array(lbl.transform(list(res1.country.values))).reshape(-1, 1)
Example #33
model = RandomForestClassifier(n_estimators=2, random_state=1)
model = model.fit(X_train, y_train)

score_report(X_test, y_test)

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=1, random_state=1)
model = model.fit(X_train, y_train)

score_report(X_test, y_test)

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(n_estimators=5, random_state=1)
model = model.fit(X_train, y_train)

score_report(X_test, y_test)

from sklearn import svm

model = svm.LinearSVC(C=0.05)
model = model.fit(X_train, y_train)

score_report(X_test, y_test)

model = svm.SVC(kernel='linear', C=0.4)
model = model.fit(X_train, y_train)

score_report(X_test, y_test)
Example #34
#ensemble models

models = {}

print "Training on all features"
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1010)

models['RFC'] = RandomForestClassifier(n_estimators=300)
models['XGB'] = xgb.XGBClassifier(max_depth=3,
                                  n_estimators=300,
                                  learning_rate=0.05)
models['GBC'] = GradientBoostingClassifier()
models['ABC'] = AdaBoostClassifier()
models['ETC'] = ExtraTreesClassifier()

for name, model in models.iteritems():
    model.fit(X_train, y_train)
    print name
    print classification_report(y_test, model.predict(X_test))
    print "Accuracy: ", accuracy_score(y_test, model.predict(X_test))
    print '\n'

feature_importances = pd.DataFrame()

for name, model in models.iteritems():
    df = pd.DataFrame(data=model.feature_importances_,
                      index=X_test.columns,
                      columns=[name]).transpose()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

#t0 = time()
#knnClf = KNeighborsClassifier()
#knnClf.fit(features_train, labels_train)
#print "default knn training time:", round(time()-t0, 3), "s"

t0 = time()
adaBoostClf = AdaBoostClassifier(n_estimators=30,learning_rate=0.4)
adaBoostClf.fit(features_train, labels_train)
print "default adaBoost training time:", round(time()-t0, 3), "s"

#t0 = time()
#rfClf = RandomForestClassifier()
#rfClf.fit(features_train, labels_train)
#print "default randomForest training time:", round(time()-t0, 3), "s"

#knnPred = knnClf.predict(features_test)
#knnacc = accuracy_score(knnPred, labels_test)

adaBoostPred = adaBoostClf.predict(features_test)
adaBoostacc = accuracy_score(adaBoostPred, labels_test)

#rfPred = rfClf.predict(features_test)
DATA_DIRECTORY = '../data/full'

MAX_FEATURES = 2500

CORPUS = []
VECTORIZER = CountVectorizer(max_features=MAX_FEATURES)

CLASSIFIERS = {
    'GaussianNB': GaussianNB(),
    'MultinomialNB': MultinomialNB(),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'LogisticRegression': LogisticRegression(),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
    'AdaBoost': AdaBoostClassifier(),
    'MLP': MLPClassifier(max_iter=500),
    'SVC(linear, C=0.025)': SVC(kernel="linear", C=0.025, probability=True)
}

ANEW_EMOTION_DICTIONARY = common_utils.get_anew_emotion_dictionary()


def text_id_to_filename(text_id):
    """
    Creates the full filename for the text_id
    :param text_id: the id
    :return: the full filename
    """
    return DATA_DIRECTORY + '/' + text_id + '.txt'
Example #37
    # EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],voting='soft', verbose=0),
    # SVC(kernel="linear", C=0.025),
    ExtraTreesClassifier(n_estimators=150, criterion="entropy", max_depth=None,
                         min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.,
                         max_features="auto", max_leaf_nodes=None, min_impurity_split=1e-7,
                         bootstrap=False, oob_score=False, n_jobs=1, random_state=410,
                         verbose=0, warm_start=False, class_weight=None),
    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                           max_depth=None, max_features='auto', max_leaf_nodes=None,
                           min_impurity_split=1e-07, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=70, n_jobs=1, oob_score=True, random_state=410,
                           verbose=0, warm_start=False),
    AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion="entropy",
                           splitter="best", max_depth=1, min_samples_split=2,
                           min_samples_leaf=1,min_weight_fraction_leaf=0., max_features=None,
                           random_state=None,max_leaf_nodes=None, min_impurity_split=1e-7,
                           class_weight=None, presort=False),
                    n_estimators=100, learning_rate=0.1,algorithm='SAMME.R', random_state=410),
    GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100,
                               subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                               min_samples_leaf=1, min_weight_fraction_leaf=0.,
                               max_depth=3, min_impurity_split=1e-7, init=None,
                               random_state=410, max_features=None, verbose=0,
                               max_leaf_nodes=None, warm_start=False,
                               presort='auto'),
    LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.001, priors=None,
                               n_components=410, store_covariance=False, tol=1e-4)]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)
Example #38
# k-nearest neighbor
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(features_train, labels_train)
print "KNN Accuracy:", neigh.score(features_test, labels_test)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(features_train, labels_train)
print "Random Forest Accuracy:", rfc.score(features_test, labels_test)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
abc.fit(features_train, labels_train)
print "AdaBoost Accuracy:", abc.score(features_test, labels_test)

prettyPicture(neigh, features_test, labels_test, "neigh.png")
prettyPicture(rfc, features_test, labels_test, "rfc.png")
prettyPicture(abc, features_test, labels_test, "abc.png")

# for clf in [neigh, rfc, abc]:
#     try:
#         print "plotting"
#         prettyPicture(clf, features_test, labels_test)
#     except NameError:
#         print "passed"
#         pass
Example #39
    def init_model(self, settings):
        return AdaBoostClassifier(n_estimators=settings["n_estimators"])
topic_dist_train_all_stars['Sentiment'] = sentiment_predicted_train
topic_dist_test_all_stars['Sentiment'] = sentiment_predicted_test

# Feed in the predicted sentiment as the feature along with the topic distribution (From LDA), for the model to train on
# Use the model to predict star rating from the topic distribution and sentiment of the testing reviews
train_features = topic_dist_train_all_stars
train_lables = stars_label_train_all_stars

test_features = topic_dist_test_all_stars
test_lables = stars_label_test_all_stars

classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    RandomForestClassifier(n_estimators=100, n_jobs=2),
    AdaBoostClassifier(n_estimators=100)
]
classifiers_names = [
    'Multinomial Naive Bayes', 'Logistic Regression', 'Random Forest',
    'AdaBoost'
]

LdaSentimentResults = {}
for (i, classifier) in enumerate(classifiers):
    model = classifier.fit(train_features, train_lables)
    preds = model.predict(test_features)

    precision = metrics.precision_score(test_lables, preds)
    recall = metrics.recall_score(test_lables, preds)
    F1 = metrics.f1_score(test_lables, preds)
    accuracy = accuracy_score(test_lables, preds)
Example #41
adasyn = over_sampling.ADASYN()
adasyn_x, adasyn_y = adasyn.fit_sample(train_x,train_y)

print adasyn_x.shape

models = []
models.append(("LR",LogisticRegression()))
models.append(("LDA",LinearDiscriminantAnalysis()))
models.append(("KNN",KNeighborsClassifier()))
models.append(("DCT",DecisionTreeClassifier()))
models.append(("GNB",GaussianNB()))
models.append(("SVC",SVC()))
models.append(("GPC",GaussianProcessClassifier(1.0*RBF(1.0))))
models.append(("MLP",MLPClassifier()))
models.append(("ADB",AdaBoostClassifier()))

for name, model in models:
	training("Normal",name,train_x,train_y)
	training("ROS",name,ROS_x,ROS_y)
	training("SMOTE",name,smote_x,smote_y)
	training("ADASYN",name,adasyn_x,adasyn_y)
	print "----------------------------------------------"

print np.unique(train_y)
weight=class_weight.compute_class_weight("balanced",np.unique(train_y),train_y)
print weight
result = SVC(class_weight={0:weight[0],1:weight[1],2:weight[2]}).fit(train_x,train_y)
print "class_weight metod and svc models result : ",result.score(test_x,test_y)

print('Naive Bayes Accuracy score: ', accuracy_score(Y_test, predictions))
print('Naive Bayes Recall score: ', recall_score(Y_test, predictions))
print('Naive Bayes F measure: ', f1_score(Y_test, predictions))
print('Naive Bayes precision score: ', precision_score(Y_test, predictions))
print()

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_cv, Y_train)
predictions = decision_tree.predict(X_test_cv)
print('Decision Tree Accuracy score: ', accuracy_score(Y_test, predictions))
print()

# Adaboost Tree Classifier
adaboost = AdaBoostClassifier()
adaboost.fit(X_train_cv, Y_train)
predictions = adaboost.predict(X_test_cv)
print('Adaboost Accuracy score: ', accuracy_score(Y_test, predictions))
print()

# K Nearest Neighbor classfier
k_nearest = KNeighborsClassifier()
k_nearest.fit(X_train_cv, Y_train)
predictions = k_nearest.predict(X_test_cv)
print('K-Nearest Neighbour Accuracy score: ', accuracy_score(Y_test, predictions))
print()

# Random Forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train_cv, Y_train)
Example #43
        anon_text = doc.create_text(anon=True)
        blob = TextBlob(anon_text)
        anon_scores.append(blob.sentiment.subjectivity)

    return pure_scores, anon_scores


if __name__ == "__main__":

    clf_names = [
        "AdaBoostClassifier", "LogisticRegression", "SGDClassifier",
        "LinearSVC", "RandomForest", "GradientBoosting"
    ]
    classifiers = [
        AdaBoostClassifier(),
        LogisticRegression(class_weight="balanced"),
        SGDClassifier(class_weight="balanced"),
        LinearSVC(class_weight="balanced"),
        RandomForestClassifier(class_weight="balanced"),
        GradientBoostingClassifier()
    ]
    """

    test_polarity_azure(clf_path="binary_classification/models/" + clf_names[0] + "_default_flair", clf=classifiers[0],
                        save_path="results/sentiment/azure_baseline_polarity.csv", anon=False)

    #for clf, clf_name in zip(classifiers, clf_names):
    #    print("\nWorking on " + clf_name + "\n")
    #    test_polarity_azure(clf_path="binary_classification/models/" + clf_name + "_default_flair", clf=clf,
    #                        save_path="results/sentiment/azure_" + clf_name + "_polarity.csv")
path = "C://Users//Arushi//PycharmProjects//Final_Thesis_chap1//9//"

for threshold in thresholdRange:
    print(threshold)
    overallPrecision = 0
    overallRecall = 0
    overallAuauc = 0
    overallAccuracy = 0
    overallMc = 0
    for i in range(iter):
        X_train = np.load(path + 'transformed_train_data_' + str(i) + '.npy').astype(float)
        Y_train = np.load(path + 'transformed_train_labels_' + str(i) + '.npy').astype(float).astype(int)
        X_test = np.load(path + 'transformed_test_data_' + str(i) + '.npy').astype(float)
        Y_test = np.load(path + 'transformed_test_labels_' + str(i) + '.npy').astype(float).astype(int)
        bp = best_params[i]
        clf = AdaBoostClassifier(base_estimator=bp['base_estimator'], n_estimators=bp['n_estimators'],
                                 algorithm=bp['algorithm'], random_state=seed)
        clf_sigmoid = CalibratedClassifierCV(clf, cv=cvCount, method='sigmoid').fit(X_train, Y_train.ravel())
        predictionsProb = clf_sigmoid.predict_proba(X_test)
        predictions = getPredictionsGivenThreshold(predictionsProb, threshold)
        precision = precision_score(Y_test, predictions)
        recall = recall_score(Y_test, predictions)
        auroc = roc_auc_score(Y_test, predictionsProb[:, 1])
        accuracy = accuracy_score(Y_test, predictions)
        matthewsCoeff = matthews_corrcoef(Y_test, predictions)

        overallPrecision += precision
        overallRecall += recall
        overallAuauc += auroc
        overallAccuracy +=accuracy
        overallMc += matthewsCoeff
    thresholdList.append(threshold)
Example #45
    [int(bagb_pred[i] != y_test[i]) for i in range(0, ts)])
bagb_error = np.sum(bagb_verror)
bagb_ccidx = np.where(bagb_verror == 0)
bagb_mcidx = np.where(bagb_verror == 1)

print("🌲  ----------Decision Tree Classfication + Bagging----------")
print(bagb_error, "misclassified data out of", ts, "(", bagb_error / ts,
      "%)\n")
'''--------------------
CART (Decision Tree) + Boosting
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
--------------------'''
# adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=20)
# adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=20,learning_rate=1.5,algorithm="SAMME") #[dodiku] there are many more parameters we can play with
adab = AdaBoostClassifier(dtc,
                          n_estimators=20,
                          learning_rate=1.5,
                          algorithm="SAMME.R")
# adab = GradientBoostingClassifier(max_depth=5, n_estimators=30)

adab.fit(x_training, y_training)

# Predicting
adab_pred = adab.predict(x_test)

# Finding mispredicted samples
adab_verror = np.asarray(
    [int(adab_pred[i] != y_test[i]) for i in range(0, ts)])
adab_error = np.sum(adab_verror)
adab_ccidx = np.where(adab_verror == 0)
adab_mcidx = np.where(adab_verror == 1)
Example #46
    def btnConvert_click(self):
        msgBox = QMessageBox()

        try:
            FoldFrom = np.int32(ui.txtFoldFrom.text())
            FoldTo = np.int32(ui.txtFoldTo.text())
        except:
            print("Please check fold parameters!")
            return

        if FoldTo < FoldFrom:
            print("Please check fold parameters!")
            return

        # Algorithm
        Algorithm = ui.cbAlgorithm.currentText()

        # NEstimators
        try:
            NEstimators = int(ui.txtNEstimators.text())
        except:
            msgBox.setText("Number of Estimators is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # LearningRate
        try:
            LearningRate = float(ui.txtLearningRate.text())
        except:
            msgBox.setText("Learning Rate is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Filter
        try:
            Filter = ui.txtFilter.text()
            if not len(Filter):
                Filter = None
            else:
                Filter = Filter.replace("\'", " ").replace(",", " ").replace(
                    "[", "").replace("]", "").split()
                Filter = np.int32(Filter)
        except:
            print("Filter is wrong!")
            return

        # OutFile
        OutFile = ui.txtOutFile.text()
        if not len(OutFile):
            msgBox.setText("Please enter out file!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        Fold = list()
        accuracy = list()
        precision = list()
        average_precision = list()
        f1score = list()
        recall = list()

        accuracyTr = list()
        precisionTr = list()
        average_precisionTr = list()
        f1scoreTr = list()
        recallTr = list()

        InFileList = list()

        OutData = dict()
        OutData["ModelAnalysis"] = "AdaBoost"

        for fold in range(FoldFrom, FoldTo + 1):
            # OutModel
            OutModel = ui.txtOutModel.text()
            if not len(OutModel):
                OutModel = None
            else:
                OutModel = OutModel.replace("$FOLD$", str(fold))

            # InFile
            InFile = ui.txtInFile.text()
            InFile = InFile.replace("$FOLD$", str(fold))
            InFileList.append(InFile)
            if not len(InFile):
                msgBox.setText("Please enter input file!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not os.path.isfile(InFile):
                msgBox.setText("Input file not found!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            InData = io.loadmat(InFile)
            # Data
            if not len(ui.txtITrData.currentText()):
                msgBox.setText("Please enter Input Train Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtITeData.currentText()):
                msgBox.setText("Please enter Input Test Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            # Label
            if not len(ui.txtITrLabel.currentText()):
                msgBox.setText("Please enter Train Input Label variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtITeLabel.currentText()):
                msgBox.setText("Please enter Test Input Label variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            TrX = InData[ui.txtITrData.currentText()]
            TeX = InData[ui.txtITeData.currentText()]
            TrL = InData[ui.txtITrLabel.currentText()][0]
            TeL = InData[ui.txtITeLabel.currentText()][0]

            try:
                if Filter is not None:
                    for fil in Filter:
                        # Remove Training Set
                        labelIndx = np.where(TrL == fil)[0]
                        TrL = np.delete(TrL, labelIndx, axis=0)
                        TrX = np.delete(TrX, labelIndx, axis=0)
                        # Remove Testing Set
                        labelIndx = np.where(TeL == fil)[0]
                        TeL = np.delete(TeL, labelIndx, axis=0)
                        TeX = np.delete(TeX, labelIndx, axis=0)
                        print("Class ID = " + str(fil) +
                              " is removed from data.")

                if ui.cbScale.isChecked():
                    TrX = preprocessing.scale(TrX)
                    TeX = preprocessing.scale(TeX)
                    print(
                        "Whole of data is scaled Train~N(0,1) and Test~N(0,1)."
                    )
            except:
                print("Cannot load data or label")
                return

            # FoldID
            if not len(ui.txtFoldID.currentText()):
                msgBox.setText("Please enter FoldID variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            try:
                currFID = InData[ui.txtFoldID.currentText()][0][0]
                Fold.append(currFID)
            except:
                print("Cannot load Fold ID!")
                return

            try:
                allvars = dict(locals(), **globals())
                exec(ui.txtBase.toPlainText(), allvars, allvars)
                base = allvars['base']
            except Exception as e:
                print("Event codes generated following error:")
                print(e)
                msgBox = QMessageBox()
                msgBox.setText(str(e))
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return

            try:
                clf = AdaBoostClassifier(base_estimator=base, n_estimators=NEstimators,\
                                         learning_rate=LearningRate,algorithm=Algorithm)
                print("FoldID = " + str(currFID) + " is training ...")
                clf.fit(TrX, TrL)
                if OutModel is not None:
                    joblib.dump(clf, OutModel)
                    print("FoldID = " + str(currFID) + " Model is saved: " +
                          OutModel)

                print("FoldID = " + str(currFID) + " is testing ...")
                PeL = clf.predict(TeX)
                PrL = clf.predict(TrX)

                OutData["confusion_matrix"] = confusion_matrix(
                    TeL, PeL, np.unique(TeL))

            except Exception as e:
                print(e)
                msgBox = QMessageBox()
                msgBox.setText(str(e))
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return

            if ui.cbAverage.isChecked():
                acc = accuracy_score(TeL, PeL)
                accTr = accuracy_score(TrL, PrL)
                accuracy.append(acc)
                accuracyTr.append(accTr)
                print(
                    "FoldID = {:d}, Average            Train {:5.2f} Test {:5.2f}"
                    .format(currFID, accTr * 100, acc * 100))

            if ui.cbPrecision.isChecked():
                pre = precision_score(TeL,
                                      PeL,
                                      average=ui.cbPrecisionAvg.currentData())
                preTr = precision_score(
                    TrL, PrL, average=ui.cbPrecisionAvg.currentData())
                precision.append(pre)
                precisionTr.append(preTr)
                print(
                    "FoldID = {:d}, Precision          Train {:5.2f} Test {:5.2f}"
                    .format(currFID, preTr * 100, pre * 100))

            if ui.cbAPrecision.isChecked():
                prA = average_precision_score(
                    TeL, PeL, average=ui.cbAPrecisionAvg.currentData())
                prATr = average_precision_score(
                    TrL, PrL, average=ui.cbAPrecisionAvg.currentData())
                average_precision.append(prA)
                average_precisionTr.append(prATr)
                print(
                    "FoldID = {:d}, Average Precision: Train {:5.2f} Test {:5.2f}"
                    .format(currFID, prATr * 100, prA * 100))

            if ui.cbRecall.isChecked():
                rec = recall_score(TeL,
                                   PeL,
                                   average=ui.cbRecallAvg.currentData())
                recTr = recall_score(TrL,
                                     PrL,
                                     average=ui.cbRecallAvg.currentData())
                recall.append(rec)
                recallTr.append(recTr)
                print(
                    "FoldID = {:d}, Recall:            Train {:5.2f} Test {:5.2f}"
                    .format(currFID, recTr * 100, rec * 100))

            if ui.cbF1.isChecked():
                f1 = f1_score(TeL, PeL, average=ui.cbF1Avg.currentData())
                f1Tr = f1_score(TrL, PrL, average=ui.cbF1Avg.currentData())
                f1score.append(f1)
                f1scoreTr.append(f1Tr)
                print(
                    "FoldID = {:d}, F1:                Train {:5.2f} Test {:5.2f}"
                    .format(currFID, f1Tr * 100, f1 * 100))

            print("FoldID = " + str(currFID) + " is analyzed!")

        if ui.cbAverage.isChecked():
            OutData["FoldAccuracy"] = accuracy
            MeanAcc = np.mean(accuracy)
            OutData["MeanTestAccuracy"] = MeanAcc
            STDAcc = np.std(accuracy)
            OutData["StdTestAccuracy"] = STDAcc
            MeanAccTr = np.mean(accuracyTr)
            OutData["MeanTrainAccuracy"] = MeanAccTr
            STDAccTr = np.std(accuracyTr)
            OutData["StdTrainAccuracy"] = STDAccTr
            print(
                "Accuracy:         Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}"
                .format(MeanAccTr * 100, STDAccTr, MeanAcc * 100, STDAcc))

        if ui.cbPrecision.isChecked():
            OutData["ModePrecision"] = ui.cbPrecisionAvg.currentText()
            OutData["FoldPrecision"] = precision
            MeanPre = np.mean(precision)
            OutData["MeanTrainPrecision"] = MeanPre
            STDPre = np.std(precision)
            OutData["StdTrainPrecision"] = STDPre
            MeanPreTr = np.mean(precisionTr)
            OutData["MeanTestPrecision"] = MeanPreTr
            STDPreTr = np.std(precisionTr)
            OutData["StdTestPrecision"] = STDPreTr
            print(
                "Precision:        Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}"
                .format(MeanPreTr * 100, STDPreTr, MeanPre * 100, STDPre))

        if ui.cbAPrecision.isChecked():
            OutData["ModeAveragePrecision"] = ui.cbAPrecisionAvg.currentText()
            OutData["FoldAveragePrecision"] = average_precision
            MeanAPre = np.mean(average_precision)
            OutData["MeanTrainAveragePrecision"] = MeanAPre
            STDAPre = np.std(average_precision)
            OutData["StdTestAveragePrecision"] = STDAPre
            MeanAPreTr = np.mean(average_precisionTr)
            OutData["MeanTrainAveragePrecision"] = MeanAPreTr
            STDAPreTr = np.std(average_precisionTr)
            OutData["StdTrainAveragePrecision"] = STDAPreTr
            print(
                "AveragePrecision: Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}"
                .format(MeanAPreTr * 100, STDAPreTr, MeanAPre * 100, STDAPre))

        if ui.cbRecall.isChecked():
            OutData["ModeRecall"] = ui.cbRecallAvg.currentText()
            OutData["FoldRecall"] = recall
            MeanRec = np.mean(recall)
            OutData["MeanTestRecall"] = MeanRec
            STDRec = np.std(recall)
            OutData["StdTestRecall"] = STDRec
            MeanRecTr = np.mean(recallTr)
            OutData["MeanTrainRecall"] = MeanRecTr
            STDRecTr = np.std(recallTr)
            OutData["StdTrainRecall"] = STDRecTr
            print(
                "Recall:           Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}"
                .format(MeanRecTr * 100, STDRecTr, MeanRec * 100, STDRec))

        if ui.cbF1.isChecked():
            OutData["ModeF1"] = ui.cbF1Avg.currentText()
            OutData["FoldF1"] = f1score
            MeanF1 = np.mean(f1score)
            OutData["MeanTestF1"] = MeanF1
            STDF1 = np.std(f1score)
            OutData["StdTestF1"] = STDF1
            MeanF1Tr = np.mean(f1scoreTr)
            OutData["MeanTrainF1"] = MeanF1Tr
            STDF1Tr = np.std(f1scoreTr)
            OutData["StdTrainF1"] = STDF1Tr
            print(
                "F1:               Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}"
                .format(MeanF1Tr * 100, STDF1Tr, MeanF1 * 100, STDF1))

        OutData["InputFiles"] = InFileList

        print("Saving ...")
        io.savemat(OutFile, mdict=OutData)
        print("DONE.")
        msgBox.setText("AdaBoost Classification is done.")
        msgBox.setIcon(QMessageBox.Information)
        msgBox.setStandardButtons(QMessageBox.Ok)
        msgBox.exec_()
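A minimal, self-contained sketch of the per-fold pattern used in the tool above (synthetic data, an illustrative output path, and a decision stump standing in for the user-supplied base estimator):

import numpy as np
import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
TrX, TeX, TrL, TeL = train_test_split(X, y, test_size=0.25, random_state=0)

base = DecisionTreeClassifier(max_depth=1)   # stands in for the exec'd user code
clf = AdaBoostClassifier(base_estimator=base, n_estimators=100, learning_rate=1.0,
                         algorithm='SAMME.R')
clf.fit(TrX, TrL)
joblib.dump(clf, "adaboost_fold.model")      # illustrative output path

PeL = clf.predict(TeX)
print(confusion_matrix(TeL, PeL, labels=np.unique(TeL)))
print("Test accuracy: {:5.2f}".format(accuracy_score(TeL, PeL) * 100))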
Example #47
0
test = pd.read_csv("test.csv")

#Select features and wrap the result in a pandas DataFrame
X_test = pd.DataFrame(test.loc[:, features].values)

#Load targets for test
submission = pd.read_csv("gender_submission.csv")

#Select the target column
Y_test = submission.loc[:, "Survived"].values

#Split train data into features and targets for training
X_train = pd.DataFrame(train.loc[:, features].values)
Y_train = train.loc[:, "Survived"].values

#Data encoding: machine learning models work only with numbers, so we convert the string features to numeric values using LabelEncoder (see the encoding sketch after this snippet)
le = preprocessing.LabelEncoder()
X_train = X_train.apply(le.fit_transform)
X_test = X_test.apply(le.fit_transform)

#Create an AdaBoostClassifier instance
classifier = AdaBoostClassifier()

#Fit the classifier
classifier.fit(X_train, Y_train)

#Calculate the score (Accuracy)
score = classifier.score(X_test, Y_test)

#Printing the score
print(score)
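The snippet above re-fits the LabelEncoder on the test frame, so a string category may map to different integers in train and test. A minimal sketch (toy frames, illustrative columns) of one common way to keep the encodings consistent is to fit one encoder per column on the combined values:

import pandas as pd
from sklearn import preprocessing

features = ["Sex", "Embarked"]   # illustrative columns
train_df = pd.DataFrame({"Sex": ["male", "female"], "Embarked": ["S", "C"]})
test_df = pd.DataFrame({"Sex": ["female", "male"], "Embarked": ["Q", "S"]})

for col in features:
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([train_df[col], test_df[col]]).astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))

print(train_df)
print(test_df)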
def train():
    # if os.path.exists('dataset/per_feature_matrix'):
    #     per_feature_matrix = pickle.load(open('dataset/per_feature_matrix', 'rb'))
    # else:
    start = time.time()
    print "extracting feature matrix..."
    if 1:
        per_feature_matrix = {}
        for each in os.listdir('dataset/per_feature'):
            path = os.path.join('dataset/per_feature/', each)
            per_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **per_feature_matrix)
        per_feature_matrix = per_feature_matrix.values()
        pickle.dump(per_feature_matrix, open('dataset/per_feature_matrix',
                                             'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix = {}
        for each in os.listdir('dataset/api_feature'):
            path = os.path.join('dataset/api_feature/', each)
            api_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **api_feature_matrix)
        api_feature_matrix = api_feature_matrix.values()
        pickle.dump(api_feature_matrix, open('dataset/api_feature_matrix',
                                             'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix = {}
        for each in os.listdir('dataset/ngram_feature'):
            path = os.path.join('dataset/ngram_feature/', each)
            ngram_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                        **ngram_feature_matrix)
        ngram_feature_matrix = ngram_feature_matrix.values()
        pickle.dump(ngram_feature_matrix,
                    open('dataset/ngram_feature_matrix', 'wb'))

    classification = pickle.load(open('dataset/classification', 'rb'))
    if per_feature_matrix is not None and api_feature_matrix is not None and ngram_feature_matrix is not None:
        feature_matrix = _concatenate(per_feature_matrix, api_feature_matrix,
                                      ngram_feature_matrix)
    elif per_feature_matrix is not None:
        feature_matrix = per_feature_matrix
    elif api_feature_matrix is not None:
        feature_matrix = api_feature_matrix
    elif ngram_feature_matrix is not None:
        feature_matrix = ngram_feature_matrix
    else:
        return
    print "extracting feature matrix done."
    print "处理前样本总数:%d" % len(feature_matrix)

    #print len(feature_matrix)
    #print len(classification)

    features = 400
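    # First-stage filter selection: keep the 400 features with the highest
    # chi-squared scores with respect to the class labels.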
    fsmodel = SelectKBest(chi2, k=features)
    raw_feature_matrix = feature_matrix
    feature_matrix = fsmodel.fit_transform(feature_matrix, classification)

    pickle.dump(fsmodel, open('dataset/fsmodel', 'wb'))

    features = 300
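    # Second-stage wrapper selection: recursive feature elimination with a
    # linear SVM, keeping 300 features.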
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features, step=1)

    #########################    DEBUG    ############################
    #classification = classification[7:]
    ##################################################################
    feature_matrix = fsmodel2.fit_transform(feature_matrix, classification)

    pickle.dump(fsmodel2, open('dataset/fsmodel2', 'wb'))

    #########################    DEBUG    ############################
    b_s = 5  # if you change this, also change the default value in dl.py
    length = len(feature_matrix)
    feature_matrix = feature_matrix[length % b_s:]
    raw_feature_matrix = raw_feature_matrix[length % b_s:]
    classification = classification[length % b_s:]
    print "处理后样本总数:%d" % len(feature_matrix)
    ##################################################################

    #########################    DEBUG    ############################
    fs_vec = []
    for i in range(len(raw_feature_matrix[0])):
        fs_vec.append(i)  # build an index vector whose values equal the column numbers

    fs_vec = fsmodel.transform(fs_vec)
    #print fs_vec
    fs_vec = fsmodel2.transform(fs_vec)
    #print fs_vec
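    # Gather the raw columns that were NOT kept by the two selectors above;
    # they become the input of the RBM "deep feature" step below.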

    feature_matrix_dl = [x for x in range(len(raw_feature_matrix))]
    for i in range(len(feature_matrix_dl)):
        feature_matrix_dl[i] = [
            x for x in range(len(raw_feature_matrix[0]) - features)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix[0])):
        if i not in fs_vec:
            print "第%d列特征没有选用" % i
            for j in range(len(feature_matrix_dl)):
                feature_matrix_dl[j][temp] = raw_feature_matrix[j][i]
            temp = temp + 1

    #print "行数%d" % len(feature_matrix_dl)
    #print "列数%d" % len(feature_matrix_dl[0])
    #print feature_matrix_dl

    ##################################################################
    #hiddeny, da = test_dA(feature_matrix_dl, len(feature_matrix_dl[0]))
    # hiddeny2, test = test_dA(feature_matrix,len(feature_matrix[0]), batch_size=6, da_object = da)
    hiddeny, da = test_rbm(feature_matrix_dl, len(feature_matrix_dl[0]))
    #print len(feature_matrix)
    print "浅度特征数:%d" % len(feature_matrix[0])
    #print len(hiddeny)
    print "深度特征数:%d" % len(hiddeny[0])
    # print (hiddeny == hiddeny2).all()

    # persist the deep feature extractor (RBM)
    pickle.dump(da, open('dataset/rbmmodel', 'wb'))

    # deep feature fusion: append the RBM hidden features to the selected features
    feature_matrix = numpy.concatenate((feature_matrix, hiddeny), axis=1)

    Z = []
    count = 0
    for i in feature_matrix:
        Z.append([])
        for j in i:
            Z[count].append(j)

        count += 1

    feature_matrix = Z

    # print feature_matrix

    Z = []
    for i in classification:
        Z.append(int(i))

    classification = Z

    if 1:
        per_feature_matrix2 = {}
        for each in os.listdir('test/per_feature'):
            path = os.path.join('test/per_feature/', each)
            per_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **per_feature_matrix2)
        per_feature_matrix2 = per_feature_matrix2.values()
        pickle.dump(per_feature_matrix2, open('test/per_feature_matrix', 'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix2 = {}
        for each in os.listdir('test/api_feature'):
            path = os.path.join('test/api_feature/', each)
            api_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **api_feature_matrix2)
        api_feature_matrix2 = api_feature_matrix2.values()
        pickle.dump(api_feature_matrix2, open('test/api_feature_matrix', 'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix2 = {}
        for each in os.listdir('test/ngram_feature'):
            path = os.path.join('test/ngram_feature/', each)
            ngram_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                         **ngram_feature_matrix2)
        ngram_feature_matrix2 = ngram_feature_matrix2.values()
        pickle.dump(ngram_feature_matrix2,
                    open('test/ngram_feature_matrix', 'wb'))

    classification2 = pickle.load(open('test/classification', 'rb'))
    if per_feature_matrix2 is not None and api_feature_matrix2 is not None and ngram_feature_matrix2 is not None:
        feature_matrix2 = _concatenate(per_feature_matrix2,
                                       api_feature_matrix2,
                                       ngram_feature_matrix2)
    elif per_feature_matrix2 is not None:
        feature_matrix2 = per_feature_matrix2
    elif api_feature_matrix2 is not None:
        feature_matrix2 = api_feature_matrix2
    elif ngram_feature_matrix2 is not None:
        feature_matrix2 = ngram_feature_matrix2
    else:
        return
    print "extracting feature matrix done."
    print "处理前样本总数:%d" % len(feature_matrix2)

    #print len(feature_matrix)
    #print len(classification)

    features = 400
    fsmodel2 = SelectKBest(chi2, k=features)
    raw_feature_matrix2 = feature_matrix2
    feature_matrix2 = fsmodel.fit_transform(feature_matrix2, classification2)

    features2 = 300
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features2, step=1)
    feature_matrix2 = fsmodel2.fit_transform(feature_matrix2, classification2)

    #########################    DEBUG    ############################
    b_s = 5  # if you change this, also change the default value in dl.py
    length = len(feature_matrix2)
    feature_matrix2 = feature_matrix2[length % b_s:]
    raw_feature_matrix2 = raw_feature_matrix2[length % b_s:]
    classification2 = classification2[length % b_s:]
    print "处理后样本总数:%d" % len(feature_matrix2)
    ##################################################################

    #########################    DEBUG    ############################
    fs_vec2 = []
    for i in range(len(raw_feature_matrix2[0])):
        fs_vec2.append(i)  # build an index vector whose values equal the column numbers

    fs_vec2 = fsmodel.transform(fs_vec2)
    #print fs_vec
    fs_vec2 = fsmodel2.transform(fs_vec2)
    #print fs_vec

    feature_matrix_dl2 = [x for x in range(len(raw_feature_matrix2))]
    for i in range(len(feature_matrix_dl2)):
        feature_matrix_dl2[i] = [
            x for x in range(len(raw_feature_matrix2[0]) - features2)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix2[0])):
        if i not in fs_vec2:
            print "第%d列特征没有选用" % i
            for j in range(len(feature_matrix_dl2)):
                feature_matrix_dl2[j][temp] = raw_feature_matrix2[j][i]
            temp = temp + 1

    hiddeny2, da = test_rbm(feature_matrix_dl2, len(feature_matrix_dl2[0]))
    #print len(feature_matrix)
    print "浅度特征数:%d" % len(feature_matrix2[0])
    #print len(hiddeny)
    print "深度特征数:%d" % len(hiddeny2[0])
    # print (hiddeny == hiddeny2).all()

    # deep feature fusion: append the RBM hidden features to the selected features
    feature_matrix2 = numpy.concatenate((feature_matrix2, hiddeny2), axis=1)

    Z = []
    count = 0
    for i in feature_matrix2:
        Z.append([])
        for j in i:
            Z[count].append(j)

        count += 1

    feature_matrix2 = Z

    # print feature_matrix

    Z = []
    for i in classification2:
        Z.append(int(i))

    classification2 = Z
    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(rf, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
    max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(gbdt, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(ada, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(lr, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(rf, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
    max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(gbdt, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(ada, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(lr, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with SVC..."
    slffork=SVC(kernel='rbf',probability = True)
    slffork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(slffork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with SVC done.\n"
    pickle.dump(slffork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''
    print "learning with BaggingClassifier..."
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    baggingfork = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5,max_features=0.5)
    baggingfork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(baggingfork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with BaggingClassifier done.\n"
    pickle.dump(baggingfork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)'''
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    gbdt = GradientBoostingClassifier(n_estimators=300,
                                      learning_rate=1.0,
                                      max_depth=100,
                                      min_samples_split=10,
                                      random_state=0)
    ada = AdaBoostClassifier(n_estimators=300)
    #slf1=SVC(kernel='rbf',probability = True)
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=0.5,
                                max_features=0.5)

    print "learning with Voting Classifier..."
    vc = VotingClassifier(estimators=[('rf', rf), ('ada', ada),
                                      ('bagging', bagging), ('gbdt', gbdt)],
                          voting='soft',
                          weights=[1.5, 1.5, 1.3, 1.5])
    vc.fit(feature_matrix, classification)
    '''
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(vc, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    '''
    print "learning with Ensemble Classifier done.\n"
    pickle.dump(vc, open('dataset/model_final', 'wb'))  # persist the trained ensemble
    print 'time :%f' % (time.time() - start)
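A self-contained sketch of the soft-voting ensemble that train() fits at the end (synthetic data; the member models and weights mirror the ones above, with smaller n_estimators to keep it quick):

from sklearn.datasets import make_classification
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              GradientBoostingClassifier, RandomForestClassifier,
                              VotingClassifier)
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=300, n_features=20, random_state=0)

vc = VotingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=50)),
                ('ada', AdaBoostClassifier(n_estimators=50)),
                ('bagging', BaggingClassifier(KNeighborsClassifier(),
                                              max_samples=0.5, max_features=0.5)),
                ('gbdt', GradientBoostingClassifier(n_estimators=50))],
    voting='soft',
    weights=[1.5, 1.5, 1.3, 1.5])
vc.fit(X, y)
print(vc.predict(X[:5]))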
Example #49
0
def class34(filename, i):
    ''' This function performs experiment 3.4
    
    Parameters
       filename : string, the name of the npz file from Task 2
       i: int, the index of the supposed best classifier (from task 3.1)  
        '''
    i = i - 1
    data = np.load(filename)["arr_0"]

    X = []
    y = []

    for d in data:
        X.append(d[0:173])
        y.append(d[173])

    X = np.array(X)
    y = np.array(y)

    classifiers = [
        SVC(kernel='linear', max_iter=1000),
        SVC(gamma=2, max_iter=1000),
        RandomForestClassifier(max_depth=5, n_estimators=10),
        MLPClassifier(alpha=0.05),
        AdaBoostClassifier()
    ]

    kf = KFold(n_splits=5, shuffle=True)

    # global list to store result
    fold_test_result_list = []
    p_values = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        accuracy_list = []
        for clf in classifiers:
            classifier = clone(clf)

            classifier.fit(X_train, y_train)
            prediction = classifier.predict(X_test)
            c_m = confusion_matrix(y_test, prediction)
            accuracy_list.append(accuracy(c_m))

        fold_test_result_list.append(accuracy_list)

    vertical_result = np.transpose(fold_test_result_list)

    # compare the result with the best classifier
    for j in range(len(classifiers)):
        if i != j:
            S = stats.ttest_rel(vertical_result[i], vertical_result[j])
            p_values.append(S[1])

    with open('a1_3.4.csv', 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        for result in fold_test_result_list:
            spamwriter.writerow(result)
        spamwriter.writerow(p_values)

        spamwriter.writerow([
            "The accuracy from cross-validation may differ from the result in Part 3.1. "
            +
            "This could be caused by the variance of the data: in 3.1 there is only one training"
            " and testing split, so the particular training set may introduce bias."
        ])
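A compact sketch of the comparison idea behind class34 (synthetic data instead of the npz file): collect per-fold test accuracies for two classifiers on the same folds, then run a paired t-test on them.

import numpy as np
from scipy import stats
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

acc_ada, acc_rf = [], []
for train_index, test_index in kf.split(X):
    ada = AdaBoostClassifier().fit(X[train_index], y[train_index])
    rf = RandomForestClassifier(max_depth=5, n_estimators=10).fit(X[train_index], y[train_index])
    acc_ada.append(accuracy_score(y[test_index], ada.predict(X[test_index])))
    acc_rf.append(accuracy_score(y[test_index], rf.predict(X[test_index])))

t_stat, p_value = stats.ttest_rel(acc_ada, acc_rf)
print(p_value)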
Example #50
0
    for pre, real in zip(predictions, input_y):
        if pre == real and real == 0:
            count += 1
    return count
def getFN(predictions,input_y):
    count = 0
    for pre, real in zip(predictions, input_y):
        if pre == 0 and real == 1:
            count += 1
    return count


if __name__ == "__main__":

    rf_model = RandomForestClassifier()
    adb_model = AdaBoostClassifier()
    gdbc_model = GradientBoostingClassifier()
    et_model = ExtraTreesClassifier()
    svc_model = SVC()


    for myK in range(5):
        train_x, test_x, train_y, test_y = get_dataset('../Features/features2.txt',myK=myK)

        train_sets = []
        test_sets = []
        for clf in [rf_model, adb_model, gdbc_model, et_model, svc_model]:
            train_set, test_set = get_stacking(clf, train_x, train_y, test_x)
            train_sets.append(train_set)
            test_sets.append(test_set)
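get_stacking is not shown in this example; a common implementation (an assumption here, not taken from the original code) returns out-of-fold predictions for the training rows and the average of the per-fold test predictions, to be used as meta-features:

import numpy as np
from sklearn.model_selection import KFold

def get_stacking_sketch(clf, train_x, train_y, test_x, n_folds=5):
    # Out-of-fold predictions for the training rows, averaged predictions for the test rows.
    train_x, train_y, test_x = map(np.asarray, (train_x, train_y, test_x))
    oof_train = np.zeros(train_x.shape[0])
    test_preds = np.zeros((n_folds, test_x.shape[0]))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        clf.fit(train_x[tr_idx], train_y[tr_idx])
        oof_train[va_idx] = clf.predict(train_x[va_idx])
        test_preds[i] = clf.predict(test_x)
    return oof_train, test_preds.mean(axis=0)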
Example #51
0
test_x = data[len(train):(len(train) + len(test))].drop(target, axis=1)

##################
#2 Model data
###################

#import sys
#sys.path.insert(0, '../helper')
#from meta_predictor import BestRegressor

#meta = BestRegressor(train_x, train_y, 4, 'r2', 0)
#meta.evaluate()

regs = []
regs.append(DecisionTreeClassifier())
regs.append(AdaBoostClassifier(n_estimators=120, learning_rate=0.2))
regs.append(RandomForestClassifier(n_estimators=50, max_depth=8))
regs.append(GradientBoostingClassifier(n_estimators=150, max_depth=3))
regs.append( XGBClassifier(n_estimators=275,max_depth=3,\
      early_stopping_rounds=5) )
infos = []
for r in regs:
    infos.append(r.__class__.__name__)

train_y = train_y.astype(bool)
qmetric = 'accuracy'
for reg, info in zip(regs, infos):
    scores = cross_val_score(reg, train_x, train_y, cv=7, scoring=qmetric)
    print("%s: %0.3f (+/- %0.2f) [%s]"%(qmetric,scores.mean(),scores.std(),\
      info))
Example #52
0
def class31(filename):
    ''' This function performs experiment 3.1
    
    Parameters
       filename : string, the name of the npz file from Task 2

    Returns:      
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier
    '''
    data = np.load(filename)["arr_0"]

    X = []
    y = []

    np.random.shuffle(data)
    for d in data:
        X.append(d[0:173])
        y.append(d[173])

    # splits data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        train_size=0.8)

    classifiers = [
        SVC(kernel='linear', max_iter=1000),
        SVC(gamma=2, max_iter=1000),
        RandomForestClassifier(max_depth=5, n_estimators=10),
        MLPClassifier(alpha=0.05),
        AdaBoostClassifier()
    ]

    accuracy_list = []
    recall_list = []
    precision_list = []
    cm_list = []

    for classifier in classifiers:
        classifier.fit(X_train, y_train)
        prediction = classifier.predict(X_test)
        c_m = confusion_matrix(y_test, prediction)
        cm_list.append(c_m)
        accuracy_list.append(accuracy(c_m))
        recall_list.append(recall(c_m))
        precision_list.append(precision(c_m))

    with open('a1_3.1.csv', 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')

        for i in range(len(accuracy_list)):
            spamwriter.writerow([i + 1] + [accuracy_list[i]] + recall_list[i] +
                                precision_list[i] +
                                cm_list[i].ravel().tolist())

    iBest = np.argmax(accuracy_list) + 1

    return (X_train, X_test, y_train, y_test, iBest)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    #ID Models I want to run
    model_list = [
        LogisticRegression(),
        LogisticRegression(penalty='l1'),
        RandomForestClassifier(n_jobs=-1),
        RandomForestClassifier(n_jobs=-1,
                               n_estimators=1000,
                               min_samples_leaf=2),
        GradientBoostingClassifier(),
        GradientBoostingClassifier(n_estimators=100,
                                   min_samples_leaf=2,
                                   max_depth=5,
                                   learning_rate=.01),
        AdaBoostClassifier(),
        AdaBoostClassifier(n_estimators=100, learning_rate=.01)
    ]
    title_list = [
        'Logistic Regression Base Model',
        'Logistic Regression - L1 Penalty Added',
        'Random Forest Classifier Base Model',
        'Random Forest - Higher N-estimators, pruning trees',
        'Gradient Boosting Classifier Base Model',
        'Gradient Boosting Classifier - Higher n_estimators and min_samples changed',
        'Adaptive Boosting Classifier Base Model',
        'Adaptive Boosting Classifier - Higher n_estimators and learning rate changed'
    ]

    ##Start NLP modeling using ridenotes column
Example #54
0
def adaboost(train_x, train_y, test_x, test_y, msno_df):
    print("Adaboost")
    clf = AdaBoostClassifier(base_estimator=LogisticRegression(), learning_rate=1.0, n_estimators=200, algorithm='SAMME.R')
    checkResult(clf, "Adaboost", train_x, train_y, test_x, test_y, msno_df)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,
                               n_classes=3, random_state=1)

n_split = 3000

X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]

print(X_train.shape)
print(y_train.shape)
bdt_real = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1)

bdt_discrete = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME")

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []

for real_test_predict, discrete_train_predict in zip(bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
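    # Loop body assumed here (the snippet is cut off at the header): record the
    # staged test error of each boosting variant as estimators are added.
    real_test_errors.append(1. - accuracy_score(y_test, real_test_predict))
    discrete_test_errors.append(1. - accuracy_score(y_test, discrete_train_predict))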
Example #56
0
def setup_clf_list():
    """
    Instantiates all classifiers of interest to be used.
    """
    # List of tuples of a classifier and its parameters.
    clf_list = []

    #
    # clf_naive = GaussianNB()
    # params_naive = {}
    # clf_list.append( (clf_naive, params_naive) )

    #
    clf_tree = DecisionTreeClassifier()
    params_tree = {
        "min_samples_split": [2, 5, 10, 20],
        "criterion": ('gini', 'entropy')
    }
    clf_list.append((clf_tree, params_tree))

    #
    clf_linearsvm = LinearSVC()
    params_linearsvm = {
        "C": [0.5, 1, 5, 10, 100, 10**10],
        "tol": [10**-1, 10**-10]
        #,"class_weight":['auto']
    }
    clf_list.append((clf_linearsvm, params_linearsvm))

    #
    clf_adaboost = AdaBoostClassifier()
    params_adaboost = {"n_estimators": [20, 25, 30, 40, 50, 100]}
    clf_list.append((clf_adaboost, params_adaboost))

    #
    clf_random_tree = RandomForestClassifier()
    params_random_tree = {
        "n_estimators": [2, 3, 5],
        "criterion": ('gini', 'entropy')
    }
    clf_list.append((clf_random_tree, params_random_tree))

    #
    clf_knn = KNeighborsClassifier()
    params_knn = {"n_neighbors": [2, 5], "p": [2, 3]}
    clf_list.append((clf_knn, params_knn))

    #
    clf_log = LogisticRegression()
    params_log = {
        "C": [0.05, 0.5, 1, 10, 10**2, 10**5, 10**10, 10**20],
        "tol": [10**-1, 10**-5, 10**-10]
        #,"class_weight":['auto']
    }
    clf_list.append((clf_log, params_log))

    #
    clf_lda = LinearDiscriminantAnalysis()
    params_lda = {"n_components": [0, 1, 2, 5, 10]}
    clf_list.append((clf_lda, params_lda))

    #
    logistic = LogisticRegression()
    rbm = BernoulliRBM()
    clf_rbm = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    params_rbm = {
        "logistic__tol": [10**-10, 10**-20],
        "logistic__C": [0.05, 0.5, 1, 10, 10**2, 10**5, 10**10, 10**20]
        #,"logistic__class_weight":['auto']
        ,
        "rbm__n_components": [2, 3, 4]
    }
    clf_list.append((clf_rbm, params_rbm))

    return clf_list
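A minimal usage sketch for a (classifier, param_grid) list like the one returned above (illustrative data and a shortened list): tune each model with GridSearchCV.

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf_list = [
    (DecisionTreeClassifier(), {"min_samples_split": [2, 5, 10]}),
    (AdaBoostClassifier(), {"n_estimators": [20, 50, 100]}),
]

for clf, params in clf_list:
    search = GridSearchCV(clf, params, cv=3)
    search.fit(X, y)
    print(type(clf).__name__, search.best_params_, search.best_score_)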
Example #57
0
def execute(self, parameters, messages):
    """
        Model Train tool
            Trains one of the predefined models with its respective parameters. This tool should be executed from a
                python toolbox.
            Currently supports AdaBoost (SAMME.R), BrownBoost, logistic regression, random forest and support vector machine.
            New models need to implement the methods fit, get_params, predict and one of predict_proba or decision_function.
            Additionally, they can implement feature_importances_.
            
        :param parameters: parameters object with all the parameters from the python-tool. It necessarily contains
            train_points: (Points) Points that will be used for the training 
            train_regressors: (Field) Name of the regressors fields that will be used for the training 
            train_response: (Field) Name of the response/class field that will be used for the training 
            output_model: (File path) Name of the file where the model will be stored
            leave_one_out: (Boolean) Choose between test with leave-one-out (true) or 3-fold cross-validation (false)  
            classifier_name: (String) Name of the model to be trained 
            
        :param messages: messages object to print in the console, must implement AddMessage 
        
        :return: None
    """
    global MESSAGES
    MESSAGES = messages
    # Print parameters for debugging purposes
    print_parameters(parameters)

    # Decompose the parameters object and assign the value to variables
    parameter_dic = {par.name: par for par in parameters}
    classifier_name = parameter_dic["classifier_name"].valueAsText
    train_points = parameter_dic["train_points"].valueAsText
    train_regressors_name = parameter_dic["train_regressors"].valueAsText.split(";")
    train_response_name = parameter_dic["train_response"].valueAsText
    output_model = parameter_dic["output_model"].valueAsText
    leave_one_out = parameter_dic["leave_one_out"].value

    # Check for correctness in the parameters
    _input_validation(parameters)

    train_regressors = _get_fields(train_points, train_regressors_name)
    train_response = _get_fields(train_points, train_response_name)

    # Choice of the model type, the specific parameters are then passed to variables
    if classifier_name == "Adaboost":
        """
            Parameters:
                num_estimators: (Integer) Number of estimators to be used 
                learning_rate: (Float) Learning rate of the model           
            For more information about the model visit 
            http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
        """
        _verbose_print("Adaboost selected")
        num_estimators = parameter_dic["num_estimators"].value
        learning_rate = parameter_dic["learning_rate"].value
        classifier = AdaBoostClassifier(base_estimator=None, n_estimators=num_estimators, learning_rate=learning_rate,
                                        algorithm='SAMME.R', random_state=None)

    elif classifier_name == "Logistic Regression":
        """
            Parameters:
                deposit_weight: (Integer) weight to be given to the deposits to deal with unbalanced data 
                penalty: (string) type of norm for the penalty 
                random_state: (Integer) seed for random generator, useful to obtain reproducible results 
                
            For more information about the model visit 
            http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
        """
        _verbose_print("Logistic Regression selected")
        penalty = parameter_dic["penalty"].valueAsText
        deposit_weight = parameter_dic["deposit_weight"].value
        random_state = parameter_dic["random_state"].value
        if deposit_weight is None:
            _verbose_print("deposit_weight is None, balanced wighting will be used")
            class_weight = "balanced"
        else:
            class_weight = {1: float(deposit_weight), -1: (100-float(deposit_weight))}

        classifier = LogisticRegression(penalty=penalty, dual=False, tol=0.0001, C=1, fit_intercept=True,
                                        intercept_scaling=1, class_weight=class_weight, random_state=random_state,
                                        solver='liblinear', max_iter=100, multi_class='ovr', verbose=0,
                                        warm_start=False, n_jobs=1)

    elif classifier_name == "Brownboost":
        """
            Parameters:
                countdown: (Float) Initial value of the countdown timer
        """
        _verbose_print("BrownBoost selected")
        countdown = parameter_dic["countdown"].value
        classifier = BrownBoostClassifier(base_estimator=None, n_estimators=1000, learning_rate=1,
                                          algorithm='BROWNIAN', random_state=None, countdown = countdown)

    elif classifier_name == "SVM":
        """
            Parameters:
                kernel: (String) Kernel to be used  
                deposit_weight: (Integer) weight to be given to the deposits to deal with unbalanced data 
                penalty: (string) type of norm for the penalty 
                random_state:(Integer) seed for random generator, useful to obtain reproducible results 
                normalize: (Boolean) Indicates if the data needs to be normalized (True) or not (False). Notice that 
                    SVM is sensitive linear transformations  

            For more information about the model visit 
            http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
        """
        penalty = parameter_dic["penalty"].value
        kernel = str(parameter_dic["kernel"].valueAsText)
        random_state = parameter_dic["random_state"].value
        deposit_weight = parameter_dic["deposit_weight"].value
        if deposit_weight is None:
            _verbose_print("deposit_weight is None, balanced wighting will be used")
            class_weight = "balanced"
        else:
            class_weight = {1: float(deposit_weight), -1: (100-float(deposit_weight))}

        classifier = SVC(C=penalty, kernel=kernel, degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                         tol=0.001, cache_size=200, class_weight=class_weight, verbose=False, max_iter=-1,
                         decision_function_shape='ovr', random_state=random_state)

    elif classifier_name == "Random Forest":
        """
            Parameters:
    
                num_estimators: (Integer) Number of trees to be trained 
                max_depth: (Integer) max depth of the trained trees 
                deposit_weight: (Integer) weight to be given to the deposits to deal with unbalanced data 
                random_state:(Integer) seed for random generator, useful to obtain reproducible results 

            For more information about the model visit 
            http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        """
        _verbose_print("Random Forest selected")
        num_estimators = parameter_dic["num_estimators"].value
        max_depth = parameter_dic["max_depth"].value
        random_state = parameter_dic["random_state"].value
        deposit_weight = parameter_dic["deposit_weight"].value
        if deposit_weight is None:
            _verbose_print("deposit_weight is None, balanced wighting will be used")
            class_weight = "balanced"
        else:
            class_weight = {1: float(deposit_weight), -1: (100-float(deposit_weight))}

        classifier = RandomForestClassifier(n_estimators=num_estimators, criterion='gini', max_depth=max_depth, min_samples_split=2,
                               min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                               max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1,
                               random_state=random_state, verbose=0, warm_start=False, class_weight=class_weight)
    else:
        raise NotImplementedError("Not implemented classifier: {}".format(classifier_name))

    # Some classifiers need the data be normalized before training, this is done here
    if classifier_name in ["SVM"]:
        normalize = parameter_dic["normalize"].value
        if normalize:
            scaler = StandardScaler().fit(train_regressors)
            train_regressors = scaler.transform(train_regressors)
            MESSAGES.AddMessage("Data normalized")
            if output_model is not None:
                # Save the information of the normalize transformation
                joblib.dump(scaler, output_model.replace(".pkl", "_scale.pkl"))

    # train the model
    start = timer()
    classifier.fit(train_regressors, train_response)
    end = timer()
    MESSAGES.AddMessage("Training time: {:.3f} seconds".format(end-start))

    if output_model is not None:
        _save_model(classifier_name, classifier, output_model, train_points, train_regressors_name, train_response_name)
    else:
        _verbose_print("No output model selected")

    _print_train_results(classifier_name, classifier, train_regressors, train_response, train_regressors_name,
                         leave_one_out)

    return
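A small standalone sketch of the class-weight handling repeated in the branches above (the helper name is illustrative, not part of the original tool): an integer deposit weight between 0 and 100 is turned into a class_weight dict for the {1, -1} labels, falling back to 'balanced'.

def build_class_weight(deposit_weight=None):
    # Illustrative helper, not part of the original tool.
    if deposit_weight is None:
        return "balanced"
    return {1: float(deposit_weight), -1: 100 - float(deposit_weight)}

print(build_class_weight())      # 'balanced'
print(build_class_weight(70))    # {1: 70.0, -1: 30.0}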
end = time.clock()
print "rbf support vector machine accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train rbf support vector machine: %.2f seconds\n" % (end - start)

#random forest on titanic
start = time.clock()
clf = RandomForestClassifier(100)
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0
end = time.clock()
print "random forest accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train random forest: %.2f seconds\n" % (end - start)

#adaboost on titanic
start = time.clock()
clf = AdaBoostClassifier()
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0
end = time.clock()
print "adaboost accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train adaboost: %.2f seconds\n" % (end - start)

#k nearest neighbors w/ euclidean distance on titanic
start = time.clock()
clf = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0
end = time.clock()
print "euclidean k nearest neighbors accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train euclidean k nearest neighbors: %.2f seconds\n" % (end - start)
Example #59
0
def third_generation(X, y, size=200, seed=None):
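    """Build a pool of diverse classifiers (MLPs, kNN, SVMs, decision/extra trees,
    AdaBoost, bagging, random-forest trees, logistic regression and SGD variants),
    fit any that are not already fitted on (X, y), and return a random sample of
    `size` of them together with the names of the full pool."""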
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],\
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ),
                      momentum=m,
                      learning_rate_init=a) for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neigbhors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neigbhors_number, ['uniform', 'distance'])
    ]
    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    dt_params = list(itertools.product(['gini', 'entropy'], \
                                       [1, 2, 3, 4, 5, None], \
                                       [None, 'sqrt', 'log2'], \
                                       ['best', 'random']))
    dt_clf = [
        DecisionTreeClassifier(criterion=c,
                               max_depth=d,
                               max_features=f,
                               splitter=s) for (c, d, f, s) in dt_params
    ]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    et_clf = [
        ExtraTreeClassifier(criterion=c,
                            max_depth=d,
                            max_features=f,
                            splitter=s) for (c, d, f, s) in dt_params
    ]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    ada_params = list(itertools.product([2**i for i in range(1, 14)], \
                                        [1, 2, 3]))
    ada_dt_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=DecisionTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_et_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=ExtraTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = [
        'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)
    ]

    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(200))]

    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],\
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [
        LogisticRegression(penalty=l, C=c, fit_intercept=f)
        for (l, c, f) in log_parameters
    ]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [
        SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
        for (l, p, f, l1) in sgd_parameters
    ]
    sgd_name = [
        'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters
    ]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \
                dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \
                log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \
                ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
                bag_stump_name + dt_rf_name + log_name + sgd_name

    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]

    return estimators, pool_name
### Try a variety of classifiers
# Import classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
#from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Initialize classifiers
clf_NB = GaussianNB()
clf_DT = tree.DecisionTreeClassifier(min_samples_split=5,criterion='entropy')
#clf_SVC = SVC()
clf_KN = KNeighborsClassifier()
clf_RF = RandomForestClassifier()
clf_AB = AdaBoostClassifier()

# Leverage tester.py to fit and test the classifiers
test_classifier(clf_NB, my_dataset, features_list)
#test_classifier(clf_DT, my_dataset, features_list)
#test_classifier(clf_SVC, my_dataset, features_list)
#test_classifier(clf_KN, my_dataset, features_list)
#test_classifier(clf_RF, my_dataset, features_list)
#test_classifier(clf_AB, my_dataset, features_list)

# Apply Grid Search to fine tune the parameters
from sklearn import grid_search

# Set the parameters for my two chosen classifiers
parameters_DT = {'min_samples_split':[2,5,10,15,20],
                 'criterion':('gini','entropy')}