# Example #1
0
class AdaBoostClassifierImpl():
    """Adapter that builds an ``SKLModel`` and forwards estimator calls to it.

    The constructor arguments are stored verbatim in ``self._hyperparams``
    and unpacked into ``SKLModel`` (defined elsewhere in the file; presumably
    the scikit-learn AdaBoost classifier -- confirm against the imports).
    """

    def __init__(self,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.0,
                 algorithm='SAMME.R',
                 random_state=None):
        """Record the hyperparameters and instantiate the wrapped model."""
        self._hyperparams = dict(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        ``y`` is only forwarded when it is not ``None``; otherwise the
        wrapped model's ``fit`` is called with ``X`` alone.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)``."""
        predictions = self._wrapped_model.predict(X)
        return predictions

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)``."""
        probabilities = self._wrapped_model.predict_proba(X)
        return probabilities

    def decision_function(self, X):
        """Return the wrapped model's ``decision_function(X)``."""
        scores = self._wrapped_model.decision_function(X)
        return scores
# Example #2
0
class AdaBoostClassifierImpl():
    """Adapter around ``SKLModel`` that also accepts a Lale operator as base estimator.

    When ``base_estimator`` is a ``lale.operators.IndividualOp`` it is
    replaced by the ``_wrapped_model`` of that operator's implementation
    instance before being handed to ``SKLModel``.
    """

    def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        """Store the hyperparameters and create the wrapped model.

        Raises:
            ValueError: if ``base_estimator`` is a Lale operator that is not
                an individual operator.
        """
        if isinstance(base_estimator, lale.operators.Operator):
            # Guard first: only individual operators can be unwrapped.
            if not isinstance(base_estimator, lale.operators.IndividualOp):
                raise ValueError("If base_estimator is a Lale operator, it needs to be an individual operator. ")
            base_estimator = base_estimator._impl_instance()._wrapped_model
        self._hyperparams = dict(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (passing ``y`` only when given) and return ``self``."""
        model = self._wrapped_model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        model = self._wrapped_model
        return model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        model = self._wrapped_model
        return model.decision_function(X)
def determined_train_and_predict(train_datas, train_lables, test_ids, test_datas):
    """Fit a boosted random-forest classifier and write its predictions to CSV.

    An ``AdaBoostClassifier`` (algorithm ``"SAMME"``, 400 rounds) is built on
    top of a 300-tree ``RandomForestClassifier``, fitted on
    ``train_datas`` / ``train_lables``, and used to predict ``test_datas``.
    The predictions are paired with ``test_ids`` in the columns ``Id`` and
    ``Response`` and saved to ``sample3.csv`` without the index column.

    Returns:
        None. The only output is the CSV file.
    """
    base_forest = RandomForestClassifier(n_estimators=300)
    booster = AdaBoostClassifier(base_forest, algorithm="SAMME", n_estimators=400)
    booster.fit(train_datas, train_lables)

    submission = pd.DataFrame({
        'Id': test_ids,
        'Response': booster.predict(test_datas),
    })
    submission.to_csv('sample3.csv', index=False)
# Example #4
0
def classify(X, y, cv):
    """Cross-validate and fit an AdaBoost classifier, then report training-set metrics.

    The report is printed to stdout: the mean cross-validation accuracy, a
    ``Counter`` of the predicted labels on ``X``, the confusion-matrix counts
    for labels 0 (negative) and 1 (positive), and the derived rates.
    Samples whose prediction is neither 0 nor 1 are not counted.

    Every ``print`` call takes a single pre-formatted string, so the output
    is the same under Python 2 (print statement) and Python 3 (print
    function).

    Args:
        X: feature matrix passed to ``cross_val_score``, ``fit`` and ``predict``.
        y: true labels, iterated positionally alongside the predictions.
        cv: cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        The classifier fitted on the full ``X`` / ``y``.
    """
    def ratio(numerator, denominator):
        # A rate with an empty denominator is undefined; report nan instead
        # of aborting the whole report with ZeroDivisionError.
        if not denominator:
            return float('nan')
        return float(numerator) / denominator

    clf = AdaBoostClassifier()

    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)

    # Predict once and reuse the result for both the counter and the metrics.
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))

    tp = tn = fp = fn = 0
    # zip pairs labels and predictions by position, which also works when
    # ``y`` is a pandas Series whose index is not 0..n-1.
    for actual, predicted in zip(y, preds):
        if actual == predicted:
            if predicted == 0:
                tn += 1
            elif predicted == 1:
                tp += 1
        elif predicted == 1:
            fp += 1
        elif predicted == 0:
            fn += 1

    print('correct positives: %s' % tp)
    print('correct negatives: %s' % tn)
    print('false positives: %s' % fp)
    print('false negatives: %s' % fn)
    print('precision: %s' % ratio(tp, tp + fp))
    print('recall (tp)/(tp+fn): %s' % ratio(tp, tp + fn))
    print('false positive rate (fp)/(fp+tn): %s' % ratio(fp, fp + tn))
    # fp/(fp+tp) is the false discovery rate; the printed label is kept as-is.
    print('false positive rate2 (fp)/(fp+tp): %s' % ratio(fp, fp + tp))
    print('prediction accuracy: %s%s\n' % (ratio(100 * (tp + tn), tp + tn + fp + fn), '%'))
    return clf
                       columns=X_train.columns)

#:# model

# Keyword arguments forwarded to the AdaBoostClassifier constructor below.
params = {'learning_rate': 0.5, 'n_estimators': 300}

classifier = AdaBoostClassifier(**params)
# NOTE(review): X_train is used as-is here, while X_test is passed through
# transform_pipeline in the audit section -- presumably X_train was already
# transformed above this point; confirm.
classifier.fit(X_train, y_train)

#:# hash
#:# e595f5d5683f3e3692608020cd5bde18
# MD5 of the UTF-8 encoded string representation of the fitted classifier.
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
# Hard predictions, and the second column of predict_proba, on the
# transformed test set.
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

# Unpacking four values assumes a 2x2 confusion matrix (binary labels).
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
print(f'precision: {precision_score(y_test, y_pred)}')
print(f'recall: {recall_score(y_test, y_pred)}')
# Specificity is computed by hand from the confusion-matrix cells.
print(f'specificity: {tn/(tn+fp)}')
print(f'f1: {f1_score(y_test, y_pred)}')

#:# session info

# TODO: add the Python version to the session info
def result():
    """Train a panel of text classifiers on an uploaded CSV and save the best one.

    Reads the upload ``myFile`` and the form fields ``filename``, ``feature``
    and ``label`` from the current ``request``. The feature column is cleaned
    (URLs and non-alphanumeric characters removed, lower-cased), split 67/33
    into train and test parts, and vectorised with a unigram TF-IDF. Twelve
    classifiers are fitted and scored on the held-out part. The first
    classifier reaching the highest accuracy is pickled to
    ``<filename>_model`` and the fitted vectoriser to ``<filename>_tfidfVect``.

    Returns:
        The rendered ``result.html`` template with the accuracies passed as
        ``ac1`` .. ``ac12``; the rendered ``nameError.html`` template when
        either requested column is missing from the CSV; ``None`` when the
        request method is not POST (no other branch exists).
    """
    if request.method != 'POST':
        return None

    upload = request.files.get('myFile')
    df = pd.read_csv(upload, encoding="ISO-8859-1")

    # NOTE(security): ``filename`` comes straight from the form and is used
    # below to build output paths for pickle files; it should be validated
    # (e.g. restricted to a safe base name) before this is exposed publicly.
    filename = request.form['filename']
    feature_col = request.form['feature']
    label_col = request.form['label']

    columns = list(df)
    if feature_col not in columns or label_col not in columns:
        return render_template('nameError.html')

    y = df[label_col]

    # Strip URLs first, then everything that is not a letter, digit or space.
    url_pattern = re.compile(r"http\S+")
    junk_pattern = re.compile(r'[^a-zA-Z0-9 ]+')
    cleaned = [junk_pattern.sub('', url_pattern.sub('', text))
               for text in df[feature_col]]
    X = pd.Series(cleaned).str.lower()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33)

    tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
    X_train_tfidf = tfidfvect.fit_transform(X_train)
    # The test matrix and the dense copies are identical for every model, so
    # build them once instead of once per classifier.
    X_test_tfidf = tfidfvect.transform(X_test)
    X_train_dense = X_train_tfidf.toarray()
    X_test_dense = X_test_tfidf.toarray()

    # (log label, estimator, whether it is fed the dense matrices).
    # Order matters: ties on accuracy are resolved in favour of the earliest
    # entry, and the position determines the ``acN`` template argument.
    candidates = [
        ("SVC", LinearSVC(), False),
        ("LR", LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg'), False),
        ("RFC", RandomForestClassifier(n_jobs=-1), False),
        ("MNB", MultinomialNB(), False),
        ("GNB", GaussianNB(), True),
        ("LRCV", LogisticRegressionCV(n_jobs=-1), False),
        ("ABC", AdaBoostClassifier(), False),
        ("BNB", BernoulliNB(), True),
        ("Per", Perceptron(n_jobs=-1), True),
        ("RidCV", RidgeClassifierCV(), True),
        ("SGDC", SGDClassifier(n_jobs=-1), True),
        # NOTE(review): logged as "XGBC" but this is a second SGDClassifier;
        # an XGBoost model was presumably intended -- confirm before changing.
        ("XGBC", SGDClassifier(n_jobs=-1), True),
    ]

    accuracies = []
    for position, (label, model, needs_dense) in enumerate(candidates):
        train_matrix = X_train_dense if needs_dense else X_train_tfidf
        test_matrix = X_test_dense if needs_dense else X_test_tfidf

        start = time()
        model.fit(train_matrix, y_train)
        accuracy = accuracy_score(y_test, model.predict(test_matrix))
        elapsed = time() - start

        # Only the first report has ever carried a unit suffix; it is kept so
        # the log lines stay exactly as before.
        suffix = " s" if position == 0 else ""
        print("accuracy {}: {} and time: {}{}".format(label, accuracy,
                                                      elapsed, suffix))
        accuracies.append(accuracy)

    # list.index returns the first maximum, matching the original a1..a12
    # precedence.
    best_model = candidates[accuracies.index(max(accuracies))][1]

    # Context managers make sure both pickle files are flushed and closed.
    with open(filename + '_model', 'wb') as model_file:
        pickle.dump(best_model, model_file)
    with open(filename + '_tfidfVect', 'wb') as vect_file:
        pickle.dump(tfidfvect, vect_file)

    template_args = {
        'ac{}'.format(number): accuracy
        for number, accuracy in enumerate(accuracies, start=1)
    }
    return render_template("result.html", **template_args)
# Confusion matrix for `etclf` (defined above this section). As a bare
# expression the result is only shown in an interactive session or notebook;
# nothing is printed when this runs as a script.
# NOTE(review): scikit-learn documents the signature as
# confusion_matrix(y_true, y_pred); the calls in this section pass the
# predictions first, so the matrices are transposed relative to that
# convention. Left unchanged -- confirm which orientation is intended.
metrics.confusion_matrix(etclf.predict(x_test), y_test)


# Public import path: the private module `sklearn.ensemble.forest` was
# deprecated in scikit-learn 0.22 and later removed.
from sklearn.ensemble import RandomForestClassifier

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)


# Public import path: `sklearn.ensemble.weight_boosting` was likewise
# deprecated in 0.22 and later removed.
from sklearn.ensemble import AdaBoostClassifier

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)




# The three matrices again, one after another, for comparison.
metrics.confusion_matrix(etclf.predict(x_test), y_test)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)


# The base random forest model seems to do best here.


import time