def classify():
    # Candidate classifiers; only the voting ensemble is returned below.
    linearSVM = LinearSVC(random_state=666, class_weight="balanced",
                          max_iter=5000, C=2.0, tol=0.001, dual=True)
    linearSVM_SVC = SVC(C=1, kernel="rbf", tol=1, random_state=0, gamma=1)
    logistic = LogisticRegression(fit_intercept=True, class_weight="balanced",
                                  n_jobs=-1, C=1.0, max_iter=200)
    rand_forest = RandomForestClassifier(n_estimators=403, random_state=666,
                                         max_depth=73, n_jobs=-1)
    bc = BaggingClassifier(base_estimator=logistic, n_estimators=403, n_jobs=-1,
                           random_state=666, max_features=410)

    ensemble_voting = VotingClassifier([("logistic", logistic),
                                        ("rand_forest", rand_forest),
                                        ("sgdc", SGDClassifier())],
                                       weights=[1, 1, 2])
    boost = AdaBoostClassifier(base_estimator=logistic)
    xgboost = XGBoostClassifier(n_estimators=103, seed=666, max_depth=4,
                                objective="multi:softmax")
    return ensemble_voting
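
# A minimal usage sketch (an assumption, not part of the original snippets):
# the VotingClassifier returned by classify() behaves like any scikit-learn
# estimator, so it can sit at the end of the same CountVectorizer/TfidfTransformer
# pipeline used later in this file. The toy texts below are purely illustrative,
# and classify()'s own imports are assumed to be in scope.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report

texts = ["good product", "bad service", "great value", "terrible quality"]
labels = ["positive", "negative", "positive", "negative"]

text_pipeline = Pipeline([("vect", CountVectorizer()),
                          ("tfidf", TfidfTransformer()),
                          ("clf", classify())])
text_pipeline.fit(texts, labels)
print(classification_report(labels, text_pipeline.predict(texts)))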
Example #2
def fit(self, X, y, **fit_params):
    # Build a fresh wrapper carrying the hyperparameters, then fit the
    # underlying xgboost.XGBClassifier and return the trained wrapper.
    result = XGBClassifierImpl(self.max_depth, self.learning_rate, self.n_estimators,
                               self.verbosity, self.objective, self.booster, self.n_jobs,
                               self.nthread, self.gamma, self.min_child_weight, self.max_delta_step,
                               self.subsample, self.colsample_bytree, self.colsample_bylevel,
                               self.colsample_bynode, self.reg_alpha, self.reg_lambda,
                               self.scale_pos_weight, self.base_score, self.random_state,
                               self.seed, self.missing)
    result._xgboost_model = XGBoostClassifier(**self.get_params())
    if not fit_params:  # **fit_params is always a dict, so check for emptiness
        result._xgboost_model.fit(X, y)
    else:
        result._xgboost_model.fit(X, y, **fit_params)
    return result
Example #3
rf_acc = cv(RandomForestClassifier(n_estimators=403, n_jobs=-1, random_state=seed),
            data_model.drop(columns='label'), data_model['label'])
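
# The cv() helper above is not included in these snippets. A plausible minimal
# sketch (an assumption, not the original implementation) is a thin wrapper
# around cross_val_score that returns mean accuracy:
from sklearn.model_selection import cross_val_score

def cv(estimator, X, y, folds=8):
    # Cross-validate the estimator and return its mean accuracy across folds.
    scores = cross_val_score(estimator, X, y, cv=folds, scoring="accuracy")
    return scores.mean()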


# ## Experiment 3: Added Feature + XGBoost

# In[55]:


from xgboost import XGBClassifier as XGBoostClassifier


# In[56]:


X_train, X_test, y_train, y_test = train_test_split(data_model.drop(columns='label'),
                                                    data_model['label'], test_size=0.3)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test,
                                                  XGBoostClassifier(seed=seed))
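
# test_classifier() is used throughout these notebook cells but its body is not
# shown here. A minimal sketch under that assumption: fit the given model on the
# training split and report weighted precision/recall/accuracy/F1 on the test split.
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

def test_classifier(X_train, y_train, X_test, y_test, classifier):
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    precision = precision_score(y_test, predictions, average="weighted")
    recall = recall_score(y_test, predictions, average="weighted")
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average="weighted")
    return precision, recall, accuracy, f1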


# ## Experiment 4: Added Feature + Naive Bayes

# In[57]:


X_train, X_test, y_train, y_test = train_test_split(data_model.drop(columns='label'),
                                                    data_model['label'], test_size=0.3)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test,
                                                  BernoulliNB())


# ## Test Data

# In[58]:
Example #4
def best_fit(X_train, y_train):
    log("")

    seed = 666
    import time as ttt
    attributes = len(X_train.columns)
    examples = len(X_train)
    now = time()
    log(ttt.ctime())
    # Parameters for SVM
    # parameters = {
    #     "dual": [True, False],
    #     "tol": [1e-3, 1e-4, 1e-5],
    #     "C": [1.0, 1.5, 2.0, 5.0, 10, 100, 1000]
    # }
    # rand_search = RandomizedSearchCV(LinearSVC(max_iter=5000), param_distributions=parameters, cv=8,n_jobs=-1,n_iter=20)
    #
    #
    # rand_search.fit(X_train,y_train)
    # report(rand_search.cv_results_, 10)
    # log(ttt.ctime())
    # log(time() - now)
    # return

    # Parameters for Bagging
    # parameters = {
    #     "n_estimators": [2, 3, 5, 13, 51, 201, 303, 403, 505],
    #     "max_features": list(map(lambda x: int(x),
    #                              [sqrt(attributes), 2 * sqrt(attributes), 3 * sqrt(attributes), attributes / 2,
    #                               attributes / 3, attributes / 4]))
    # }
    #
    # rand_search = RandomizedSearchCV(BaggingClassifier(
    #     base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, C=1.0, tol=0.0001, dual=True),
    #     random_state=seed, n_jobs=1), param_distributions=parameters, n_jobs=-1, n_iter=3, cv=8,
    #     scoring=make_scorer(f1_score, average="micro", labels=["positive", "negative", "neutral"]))
    #
    # now = time()
    # log(ttt.ctime())
    # rand_search.fit(X_train, y_train)
    #
    # report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)

    # Parameters for RF
    # log("RF:")
    # parameters = {
    #     "n_estimators":[103, 201, 305, 403, 666, 1001, 5007, 10001],
    #     "max_depth":[None, 5, 20, 40, 73, 100, 1000, 2000],
    #     "criterion":["gini", "entropy"]
    # }
    #
    # rand_search = RandomizedSearchCV(RandomForestClassifier(random_state=seed,n_jobs=-1),param_distributions=parameters,
    #                                  n_iter=15,scoring="accuracy",
    #                                  n_jobs=1,cv=10)
    # now = time()
    # log(ttt.ctime())
    # rand_search.fit(X_train, y_train)
    #
    # report(rand_search.cv_results_, 10)
    # log(ttt.ctime())
    # log(time() - now)

    # Parameters for XGBoost
    log("XGB:")
    parameters = {
        "n_estimators": [103, 201, 403],
        "max_depth": [3, 10, 15],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.05, 0.1, 0.15, 0.3]
    }

    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed),
                                     param_distributions=parameters,
                                     n_iter=5,
                                     scoring="accuracy",
                                     n_jobs=-1,
                                     cv=8)

    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)

    parameters = {
        "n_estimators": [403, 666, 1000],
        "max_depth": [40, 50, 90, 100, 200],
        "subsample": [1.0, 0.6, 0.9],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.1, 0.15, 0.5]
    }

    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed),
                                     param_distributions=parameters,
                                     n_iter=5,
                                     scoring="accuracy",
                                     n_jobs=-1,
                                     cv=8)

    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)

    return  # note: the VotingClassifier search below is never reached

    # Parameters for VotingClassifier
    # parameters = {
    #     "weights": [
    #         [1, 1, 1],
    #         [2, 1, 1],
    #         [2, 2, 1],
    #         [4, 1, 5],
    #         [1, 1, 2],
    #         [5, 1, 2],
    #         [5, 2, 1],
    #         [5, 3, 2],
    #         [6, 2, 1],
    #         [6, 1, 5],
    #         [6, 1, 2],
    #         [7, 1, 6],
    #         [7, 2, 3],
    #     ]
    # }
    log("Voting RF XGB NB:")
    parameters = {
        "weights": [[1, 1, 1], [2, 1, 1], [1, 1, 2], [4, 1, 5], [3, 1, 3],
                    [3, 1, 4]]
    }

    rand_search = GridSearchCV(VotingClassifier(
        [("randomforest",
          RandomForestClassifier(
              n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)),
         ("naivebayes", BernoulliNB()),
         ("xgboost",
          XGBoostClassifier(n_estimators=103,
                            seed=seed,
                            max_depth=3,
                            objective="multi:softmax"))],
        voting="soft",
        n_jobs=1),
                               scoring="accuracy",
                               n_jobs=-1,
                               cv=8,
                               param_grid=parameters)
    rand_search.fit(X_train, y_train)
    #
    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)
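
# The log() and report() helpers called in best_fit() are not part of these
# snippets. A minimal sketch (an assumption): log() simply prints a message, and
# report() summarises the top-ranked candidates of a CV search, in the spirit of
# the utility shown in scikit-learn's RandomizedSearchCV examples.
import numpy as np

def log(message):
    print(message)

def report(results, n_top=3):
    # Print mean/std test score and parameters for the n_top best candidates.
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            log("Model with rank: {0}".format(i))
            log("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results["mean_test_score"][candidate],
                results["std_test_score"][candidate]))
            log("Parameters: {0}".format(results["params"][candidate]))
            log("")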
Example #5
from xgboost import XGBClassifier as XGBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(data_model.drop(
    columns='label', axis=1),
                                                    data_model['label'],
                                                    test_size=0.3)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test,
                                                  y_test,
                                                  XGBoostClassifier(seed=seed))
# Base estimators for the ensembles below.
linearSVM_SVC = SVC(C=1, kernel="rbf", tol=1, random_state=0, gamma=1)
logistic = LogisticRegression(fit_intercept=True,
                              class_weight="balanced",
                              n_jobs=-1,
                              C=1.0,
                              max_iter=200)
rand_forest = RandomForestClassifier(n_estimators=403,
                                     random_state=666,
                                     max_depth=73,
                                     n_jobs=-1)
bc = BaggingClassifier(base_estimator=logistic,
                       n_estimators=403,
                       n_jobs=-1,
                       random_state=666,
                       max_features=410)

ensemble_voting = VotingClassifier([("svm", linearSVM_SVC),
                                    ("logistic", logistic),
                                    ("rand_forest", rand_forest),
                                    ("sgdc", SGDClassifier())],
                                   weights=[1, 2, 1, 1])
boost = AdaBoostClassifier(base_estimator=logistic)
xgboost = XGBoostClassifier(n_estimators=103,
                            seed=666,
                            max_depth=4,
                            objective="multi:softmax")
X_train, X_test, Y_train, Y_test = train_test_split(left, right, test_size=0.1)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', ensemble_voting)])
text_clf = text_clf.fit(X_train, Y_train)
predicted = text_clf.predict(X_test)
print(classification_report(Y_test, predicted))
    cdTest = dataCleanup()
    cdTest.init(data_dir + 'tweets.csv', isTestingSet=True)
    cdTest.buildFeatures()
    cdTest.cleanData()
    tTest = cdTest.processedData
    stTest = stemAndTokenizeData()
    tTest = stTest.tokenize(tTest)
    tTest = stTest.stem(tTest)
    bwTest = buildWordList()
    bwTest.buildWordListFunction(tTest)
    bowTest = bagOfWords()
    bowTest.buildDataModel(tTest, bwTest.wordList, uW2V, isTestingSet=True)
    dataModelTest = bowTest.dataModel

    print("Testing Model built!")
    xgboost = XGBoostClassifier(seed=seed,
                                n_estimators=403,
                                max_depth=10,
                                objective="binary:logistic",
                                learning_rate=0.15)
    xgboost.fit(dataModel.iloc[:, 1:], dataModel.iloc[:, 0])

    print("Training Finished!")

    predictions = xgboost.predict(dataModelTest.iloc[:, 1:])
    results = pd.DataFrame([], columns=["Id", "Category"])
    results["Id"] = dataModelTest["original_id"].astype("int64")
    results["Category"] = predictions
    results.to_csv("results.csv", index=False)
    print("Results have been saved to file!!")
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier as XGBoostClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv('Loan payments data.csv')
print(data.head())

# Select the feature columns and the target column.
X = data[['Principal', 'terms', 'age', 'education', 'Gender']].copy()
Y = data['loan_status']

# Label-encode the categorical columns.
gender_label = LabelEncoder()
gender_label.fit(X.Gender)
X['Gender_Labelled'] = gender_label.transform(X.Gender)

Ed_Label = LabelEncoder()
Ed_Label.fit(X.education)
X['Ed_Labelled'] = Ed_Label.transform(X.education)

Y_label = LabelEncoder()
Y_label.fit(Y)
Y_labelled = Y_label.transform(Y)

X_Labelled = X[['Principal', 'terms', 'age', 'Ed_Labelled', 'Gender_Labelled']]

x_train, x_test, y_train, y_test = train_test_split(X_Labelled, Y_labelled)
model = XGBoostClassifier()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
print(accuracy_score(y_test, y_predict))
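
# A short follow-up sketch (an assumption, not in the original): the encoded
# predictions can be mapped back to the original loan_status strings with the
# fitted LabelEncoder.
predicted_status = Y_label.inverse_transform(y_predict)
print(predicted_status[:5])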
Example #9
def best_fit(X_train, y_train):
    log("")

    seed = 666
    import time as ttt
    attributes = len(X_train.columns)
    examples = len(X_train)
    now = time()
    log(ttt.ctime())

    log(ttt.ctime())
    log(time() - now)

    log("XGB:")
    parameters = {
        "n_estimators": [103, 201, 403],
        "max_depth": [3, 10, 15],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.05, 0.1, 0.15, 0.3]
    }

    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed),
                                     param_distributions=parameters,
                                     n_iter=5,
                                     scoring="accuracy",
                                     n_jobs=-1,
                                     cv=8)

    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)

    parameters = {
        "n_estimators": [403, 666, 1000],
        "max_depth": [40, 50, 90, 100, 200],
        "subsample": [1.0, 0.6, 0.9],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.1, 0.15, 0.5]
    }

    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed),
                                     param_distributions=parameters,
                                     n_iter=5,
                                     scoring="accuracy",
                                     n_jobs=-1,
                                     cv=8)

    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)

    return  # note: the VotingClassifier search below is never reached

    log("Voting RF XGB NB:")
    parameters = {
        "weights": [[1, 1, 1], [2, 1, 1], [1, 1, 2], [4, 1, 5], [3, 1, 3],
                    [3, 1, 4]]
    }

    rand_search = GridSearchCV(VotingClassifier(
        [("randomforest",
          RandomForestClassifier(
              n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)),
         ("naivebayes", BernoulliNB()),
         ("xgboost",
          XGBoostClassifier(n_estimators=103,
                            seed=seed,
                            max_depth=3,
                            objective="multi:softmax"))],
        voting="soft",
        n_jobs=1),
                               scoring="accuracy",
                               n_jobs=-1,
                               cv=8,
                               param_grid=parameters)
    rand_search.fit(X_train, y_train)
    #
    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)