class AdaBoostClassifierImpl():
    """Thin wrapper that delegates to the scikit-learn AdaBoost model.

    The constructor records the hyperparameters and immediately builds the
    underlying ``SKLModel``; the remaining methods simply forward to it.
    """

    def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        # Remember the configuration, then instantiate the wrapped estimator.
        self._hyperparams = dict(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; labels are forwarded only when supplied."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
class AdaBoostClassifierImpl():
    """Wrapper around the scikit-learn AdaBoost model that also accepts a
    Lale individual operator as ``base_estimator``, unwrapping its underlying
    scikit-learn estimator before delegating.
    """

    def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        # A Lale operator is only usable here when it is an individual
        # operator whose wrapped sklearn model can be extracted.
        if isinstance(base_estimator, lale.operators.Operator):
            if not isinstance(base_estimator, lale.operators.IndividualOp):
                raise ValueError("If base_estimator is a Lale operator, it needs to be an individual operator. ")
            base_estimator = base_estimator._impl_instance()._wrapped_model
        self._hyperparams = {
            'base_estimator': base_estimator,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm,
            'random_state': random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; labels are forwarded only when supplied."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
def determined_train_and_predict(train_datas, train_lables, test_ids, test_datas):
    """Train AdaBoost over 300-tree random forests and write a submission
    file (``Id``, ``Response`` columns) to ``sample3.csv``.
    """
    booster = AdaBoostClassifier(
        RandomForestClassifier(n_estimators=300),
        algorithm="SAMME",
        n_estimators=400,
    )
    booster.fit(train_datas, train_lables)
    submission = pd.DataFrame({
        'Id': test_ids,
        'Response': booster.predict(test_datas),
    })
    submission.to_csv('sample3.csv', index=False)
def classify(X, y, cv):
    """Cross-validate and fit an AdaBoost classifier, then report
    training-set confusion statistics.

    Fix: the original used Python 2 ``print`` statements, which are a syntax
    error under Python 3 (the version the rest of this file targets, given
    its f-strings); all output now uses ``print()`` calls with the same text.

    Parameters: ``X``/``y`` — training features and binary {0, 1} labels;
    ``cv`` — number of cross-validation folds.  Returns the fitted
    classifier.  Note: the precision/recall prints raise ZeroDivisionError
    when a denominator is zero, exactly as the original did.
    """
    clf = AdaBoostClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))
    fp = tp = fn = tn = 0
    # Tally the confusion counts against the training labels.
    for truth, pred in zip(y, preds):
        if truth == pred:
            if pred == 0:
                tn += 1
            elif pred == 1:
                tp += 1
        elif pred == 1:
            fp += 1
        elif pred == 0:
            fn += 1
    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    print('precision:', float(tp) / (tp + fp))
    print('recall (tp)/(tp+fn):', float(tp) / (tp + fn))
    print('false positive rate (fp)/(fp+tn):', float(fp) / (fp + tn))
    print('false positive rate2 (fp)/(fp+tp):', float(fp) / (fp + tp))
    print('prediction accuracy: %s%s\n' % (100 * float(tp + tn) / (tp + tn + fp + fn), '%'))
    return clf
columns=X_train.columns)  # NOTE(review): tail of a call opened above this chunk — presumably wrapping X_train into a labeled frame; confirm upstream.

#:# model
# AdaBoost with explicit hyperparameters (damped learning rate, 300 estimators).
params = {'learning_rate': 0.5, 'n_estimators': 300}
classifier = AdaBoostClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# e595f5d5683f3e3692608020cd5bde18
# Fingerprint the configured estimator (its repr) so the audit below can be
# tied to this exact configuration.
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
# Score on the held-out set; X_test goes through the same preprocessing
# pipeline that produced X_train.
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
print(f'precision: {precision_score(y_test, y_pred)}')
print(f'recall: {recall_score(y_test, y_pred)}')
print(f'specificity: {tn/(tn+fp)}')
print(f'f1: {f1_score(y_test, y_pred)}')

#:# session info
# TODO: add the Python version to the session info.
def result():
    """Handle the model-training form POST.

    Reads the uploaded CSV, cleans the chosen text column (URL removal,
    non-alphanumeric stripping, lowercasing), TF-IDF-vectorizes it, trains
    twelve classifiers, pickles the most accurate one together with the
    fitted vectorizer, and renders the accuracy report.

    Returns the rendered ``result.html``, ``nameError.html`` when the
    requested columns are missing, or (as before) None for non-POST
    requests.

    Fixes vs. original: the twelve copy-pasted train/score stanzas and the
    12-way ``elif`` pickle chain are replaced by data-driven loops, and the
    pickle files are opened via ``with`` so the handles are closed (the
    original leaked them).
    """
    if request.method == 'POST':
        path = request.files.get('myFile')
        df = pd.read_csv(path, encoding="ISO-8859-1")
        filename = request.form['filename']
        feature_col = request.form['feature']
        label_col = request.form['label']
        if feature_col not in list(df) or label_col not in list(df):
            return render_template('nameError.html')
        y = df[label_col]
        X = df[feature_col]

        # Strip URLs and every character outside [a-zA-Z0-9 ], then lowercase.
        cleaned = []
        for subject in X:
            without_urls = re.sub(r"http\S+", "", subject)
            cleaned.append(re.sub(r'[^a-zA-Z0-9 ]+', '', without_urls))
        X = pd.Series(cleaned).str.lower()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)
        # Hoisted out of the loop: every model scores the same test matrix,
        # and the dense copies are shared by the models that need them.
        X_test_tfidf = tfidfvect.transform(X_test)
        X_train_dense = X_train_tfidf.toarray()
        X_test_dense = X_test_tfidf.toarray()

        # (print format, estimator factory, needs dense arrays).
        # NOTE(review): the last entry is printed as "XGBC" but was a second
        # SGDClassifier in the original code; kept as-is to preserve
        # behavior — swap in a real XGBoost model if that was the intent.
        candidates = [
            ("accuracy SVC: {} and time: {} s", lambda: LinearSVC(), False),
            ("accuracy LR: {} and time: {}", lambda: LogisticRegression(n_jobs=-1, multi_class='multinomial', solver='newton-cg'), False),
            ("accuracy RFC: {} and time: {}", lambda: RandomForestClassifier(n_jobs=-1), False),
            ("accuracy MNB: {} and time: {}", lambda: MultinomialNB(), False),
            ("accuracy GNB: {} and time: {}", lambda: GaussianNB(), True),
            ("accuracy LRCV: {} and time: {}", lambda: LogisticRegressionCV(n_jobs=-1), False),
            ("accuracy ABC: {} and time: {}", lambda: AdaBoostClassifier(), False),
            ("accuracy BNB: {} and time: {}", lambda: BernoulliNB(), True),
            ("accuracy Per: {} and time: {}", lambda: Perceptron(n_jobs=-1), True),
            ("accuracy RidCV: {} and time: {}", lambda: RidgeClassifierCV(), True),
            ("accuracy SGDC: {} and time: {}", lambda: SGDClassifier(n_jobs=-1), True),
            ("accuracy XGBC: {} and time: {}", lambda: SGDClassifier(n_jobs=-1), True),
        ]
        models = []
        accuracies = []
        for fmt, make_model, needs_dense in candidates:
            start = time()
            model = make_model()
            if needs_dense:
                model.fit(X_train_dense, y_train)
                pred = model.predict(X_test_dense)
            else:
                model.fit(X_train_tfidf, y_train)
                pred = model.predict(X_test_tfidf)
            acc = accuracy_score(y_test, pred)
            print(fmt.format(acc, (time() - start)))
            models.append(model)
            accuracies.append(acc)

        # Persist the first best-scoring model (ties resolved in list order,
        # matching the original if/elif chain) plus the fitted vectorizer.
        best = models[accuracies.index(max(accuracies))]
        with open(filename + '_model', 'wb') as model_file:
            pickle.dump(best, model_file)
        with open(filename + '_tfidfVect', 'wb') as vect_file:
            pickle.dump(tfidfvect, vect_file)

        # Same keyword arguments as the original ac1..ac12 call.
        scores = {'ac%d' % (i + 1): acc for i, acc in enumerate(accuracies)}
        return render_template("result.html", **scores)
# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)

# Fix: sklearn.ensemble.forest / sklearn.ensemble.weight_boosting were
# private module paths removed in scikit-learn 0.24; import the same
# classes from the public sklearn.ensemble package instead.
from sklearn.ensemble import RandomForestClassifier

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)

from sklearn.ensemble import AdaBoostClassifier

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)

# Side-by-side comparison of the three ensembles (notebook style: the
# matrices are displayed, not captured in variables).
metrics.confusion_matrix(etclf.predict(x_test), y_test)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)
# The base random forest model seems to do best here.
import time