def determined_train_and_predict(train_datas, train_lables, test_ids, test_datas,
                                 out_path='sample3.csv'):
    """Train an AdaBoost-over-RandomForest ensemble and write predictions to CSV.

    Parameters
    ----------
    train_datas : training feature matrix.
    train_lables : training target labels aligned with ``train_datas``.
    test_ids : identifiers written to the ``Id`` column of the output file.
    test_datas : feature matrix to predict on.
    out_path : destination CSV path (new parameter; defaults to the previously
        hard-coded ``'sample3.csv'`` for backward compatibility).

    Returns
    -------
    The fitted classifier (new; the original returned ``None``, so existing
    callers are unaffected).
    """
    # SAMME = discrete boosting; the base estimator is itself a forest ensemble.
    class_fier = AdaBoostClassifier(
        RandomForestClassifier(n_estimators=300),
        algorithm="SAMME",
        n_estimators=400,
    )
    class_fier.fit(train_datas, train_lables)

    predict_lables = class_fier.predict(test_datas)
    out_file_content = pd.DataFrame({'Id': test_ids, 'Response': predict_lables})
    out_file_content.to_csv(out_path, index=False)
    return class_fier
# Exemple #2 (snippet separator left over from scraping; commented out so the file parses cleanly)
# 0
def AB(pth):
    """Train an AdaBoost classifier on precomputed bag-of-features descriptors.

    Loads training features from ``<pth>/training_features.npy``, standardizes
    them, fits an ``AdaBoostClassifier(n_estimators=100)``, dumps the model
    together with the class list and the scaler to ``<pth>/ab-bof.pkl``, then
    evaluates via the module-level ``test`` helper.

    Parameters
    ----------
    pth : directory containing ``training_features.npy`` and receiving the
        pickled model.
    """
    train_desc = np.load(pth + '/training_features.npy')

    # Removed dead code: the original computed word-occurrence counts and an
    # IDF vector here, but never used either value.

    # Scale the descriptors to zero mean / unit variance before boosting.
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)

    modelAB = AdaBoostClassifier(n_estimators=100)
    # NOTE(review): ``train_labels``, ``img_classes`` and ``test`` are module
    # globals defined elsewhere in the original file — confirm before running.
    modelAB.fit(train_desc, np.array(train_labels))

    joblib.dump((modelAB, img_classes, stdSlr), pth + "/ab-bof.pkl", compress=3)
    test(pth, "ab-")
# Exemple #3 (snippet separator left over from scraping; commented out so the file parses cleanly)
# 0
def classify(X, y, cv):
    """Fit an AdaBoostClassifier and report cross-validated and training metrics.

    Parameters
    ----------
    X : feature matrix.
    y : binary labels (0 = negative, 1 = positive).
    cv : number of cross-validation folds.

    Returns
    -------
    The classifier fitted on the full ``(X, y)``.

    Notes
    -----
    Fixed: the original used Python 2 ``print`` statements (a syntax error on
    Python 3) and crashed with ZeroDivisionError when a confusion-matrix
    denominator was zero. Single-argument ``print(...)`` calls keep the output
    identical on both Python 2 and 3.
    """
    clf = AdaBoostClassifier()

    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)

    # Predict once and reuse (the original called clf.predict(X) twice).
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))

    # Confusion counts over the training set.
    tp = tn = fp = fn = 0
    for truth, pred in zip(y, preds):
        if truth == pred:
            if pred == 1:
                tp += 1
            elif pred == 0:
                tn += 1
        elif pred == 1:
            fp += 1
        elif pred == 0:
            fn += 1

    def _ratio(num, den):
        # Guard against empty classes instead of raising ZeroDivisionError.
        return float(num) / den if den else float('nan')

    print('correct positives: %s' % tp)
    print('correct negatives: %s' % tn)
    print('false positives: %s' % fp)
    print('false negatives: %s' % fn)
    print('precision: %s' % _ratio(tp, tp + fp))
    print('recall (tp)/(tp+fn): %s' % _ratio(tp, tp + fn))
    print('false positive rate (fp)/(fp+tn): %s' % _ratio(fp, fp + tn))
    print('false positive rate2 (fp)/(fp+tp): %s' % _ratio(fp, fp + tp))
    print('prediction accuracy: %s%s\n' % (100 * _ratio(tp + tn, tp + tn + fp + fn), '%'))
    return clf
# One-hot encode the categorical columns, then drop the raw versions and any
# rows with missing values.
l_train = l_train.join(pd.get_dummies(l_train['Transmission']))
l_train = l_train.join(pd.get_dummies(l_train['WheelType']))
l_train = l_train.join(pd.get_dummies(l_train['Size']))

l_train = l_train.drop(['Auction', 'Transmission', 'WheelType', 'Size'], axis=1)
l_train = l_train.dropna()

data = l_train.drop('IsBadBuy', axis=1)
target = l_train['IsBadBuy']
# NOTE(review): sklearn.cross_validation was removed in sklearn 0.20; on a
# modern install use sklearn.model_selection.train_test_split instead.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=.3)

# AdaBoost runs the best
model = AdaBoostClassifier()
clf = model.fit(x_train, y_train)
scores = clf.score(x_train, y_train)

# Fixed: py2 print statements -> single-argument print() calls (valid on 2 and 3).
print(metrics.classification_report(y_train, clf.predict(x_train)))
print(metrics.classification_report(y_test, clf.predict(x_test)))
y_pred = clf.predict(x_test)

metrics.roc_auc_score(y_train, clf.predict(x_train))
metrics.roc_auc_score(y_test, clf.predict(x_test))

# Create a submission
#submission = pd.DataFrame({ 'RefId' : l_test.RefId, 'prediction' : y_pred })
#submission.to_csv('/users/alexandersedgwick/desktop/submission.csv')

# NOTE(review): etclf is never defined in this chunk — presumably an
# ExtraTreesClassifier created earlier in the file; confirm before running.
etclf.fit(x_train, y_train)

# Confusion matrix for the extra-trees model.
metrics.confusion_matrix(etclf.predict(x_test), y_test)


# Fixed: sklearn.ensemble.forest is a private module (removed in sklearn 0.24);
# import from the public sklearn.ensemble package.
from sklearn.ensemble import RandomForestClassifier

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)


# Fixed: sklearn.ensemble.weight_boosting is likewise private/removed.
from sklearn.ensemble import AdaBoostClassifier

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)


# Side-by-side confusion matrices for the three models.
metrics.confusion_matrix(etclf.predict(x_test), y_test)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)


#The base random forest model seems to do best here.


import time
# Exemple #6 (snippet separator left over from scraping; commented out so the file parses cleanly)
# 0
def init_model(input_data, target_data):
    """Build and fit an AdaBoost classifier on the given data.

    Uses 285 boosting rounds of the real-valued SAMME.R algorithm with a
    learning rate of 0.19, and returns the fitted model.
    """
    classifier = AdaBoostClassifier(
        n_estimators=285,
        learning_rate=0.19,
        algorithm='SAMME.R',
    )
    classifier.fit(input_data, target_data)
    return classifier