def predictResult(x_train, y_train, y_test, x_test):
    """Train, persist and evaluate the global random-forest model, then
    classify the single record stored in /tmp/predict_result.csv.

    Returns the prediction array for that record.
    """
    # Load the record to be classified; keep every column except the target.
    sample = pd.read_csv("/tmp/predict_result.csv", header=0)
    feature_cols = sample.columns[sample.columns != columnResultName]
    features = Normalizer().fit_transform(sample[feature_cols])

    # Fit the model, then round-trip it through disk so the evaluated model
    # is the persisted one.
    randomForest.fit(x_train, y_train)
    dump(randomForest, 'randomForest.model')
    loaded_model = load('randomForest.model')

    test_preds = loaded_model.predict(x_test)
    print("predicao:", test_preds)
    print("Matriz de Confusao LR:")
    print(cfm(y_test, test_preds))
    print("F1 score LR:")
    print(f1s(y_test, test_preds))
    print("Precision score LR:")
    print(ps(y_test, test_preds))
    print("Recall score LR:")
    print(rs(y_test, test_preds))
    print("Classification Report")
    print(cr(y_test, test_preds))

    single_pred = loaded_model.predict(features)
    print("predico unica", single_pred)
    return single_pred
def logistic_model_using_pca_plus_prediction(self):
    """Fit a PCA-reduced logistic regression, derive a decision threshold
    from the training ROC curve, and print test-set metrics."""
    self.clf = LR(random_state=29, tol=0.000000000001)
    self.data4 = self.df1.drop(["y"], axis=1)
    target = self.df1["y"]

    # Standardize, then project onto 100 principal components
    # (100 chosen as the optimal number of principal components).
    self.data4 = StandardScaler().fit_transform(self.data4)
    reduced = PCA(n_components=100).fit_transform(self.data4)

    x_tr, x_te, y_tr, y_te = tts(
        reduced, target, test_size=0.2, stratify=target, random_state=29)
    self.model = self.clf.fit(x_tr, y_tr)

    test_probs = self.model.predict_proba(x_te)[:, 1]
    train_probs = self.model.predict_proba(x_tr)[:, 1]

    # Threshold maximizing Youden's J statistic (tpr - fpr) on the
    # TRAINING ROC curve, so the test set is not used for tuning.
    self.fpr, self.tpr, self.threshold = roc_curve(y_tr, train_probs)
    self.optimal = self.threshold[np.argmax(self.tpr - self.fpr)]

    self.out1 = pd.DataFrame({"y_true": y_te, "y_pred": test_probs})
    self.out1["predicted_class"] = self.out1["y_pred"].apply(
        self.class_value)

    print(rs(self.out1["y_true"], self.out1["predicted_class"]))
    print(ase(self.out1["y_true"], self.out1["predicted_class"]))
    print(auc_score(y_te, test_probs))
def predictResult(betterN, x_train, y_train, y_test, x_test):
    """Evaluate the global KNN model and classify the single record stored
    in /tmp/predict_result.csv.

    betterN > 0 means a tuned neighbor count was supplied: the model is
    re-configured and re-fitted. Otherwise the model was loaded from disk
    and is used as-is.
    """
    sample = pd.read_csv("/tmp/predict_result.csv", header=0)
    feature_cols = sample.columns[sample.columns != columnResultName]
    features = np.array(sample[feature_cols])

    # When betterN is not supplied it means the model was loaded from disk,
    # so skip re-tuning/re-fitting.
    if betterN > 0:
        knn.n_neighbors = betterN
        knn.fit(x_train, y_train)
        # dump(knn, 'models/knn_teste.joblib')

    test_preds = knn.predict(x_test)
    print("predicao: a", test_preds)
    print("Matriz de Confusao NB:")
    print(cfm(y_test, test_preds))
    print("F1 score NB:")
    print(f1s(y_test, test_preds))
    print("Precision score NB:")
    print(ps(y_test, test_preds))
    print("Recall score NB:")
    print(rs(y_test, test_preds))
    print("Classification Report")
    print(cr(y_test, test_preds))

    single_pred = knn.predict(features)
    print("predico unica", int(single_pred[0]))
    print("predicao unica score")
    print(single_pred)
    return single_pred
def predictResult(x_train, y_train, y_test, x_test):
    """Cross-validate, train, persist and evaluate the logistic-regression
    model, plot its confusion matrix and ROC curve, then classify the
    single record stored in /tmp/predict_result.csv."""
    sample = pd.read_csv("/tmp/predict_result.csv", header=0)
    feature_cols = sample.columns[sample.columns != columnResultName]
    features = Normalizer().fit_transform(sample[feature_cols])

    scores = cross_val_score(logisticR, x_train, y_train, n_jobs=30)
    print("scores cross val")
    print(scores)

    # Fit, then round-trip through disk so the evaluated model is the
    # persisted one.
    logisticR.fit(x_train, y_train)
    dump(logisticR, 'logistic.model')
    loaded_model = load('logistic.model')

    test_preds = loaded_model.predict(x_test)
    print("predicao:", test_preds)
    print("Matriz de Confusao LR:")
    print(cfm(y_test, test_preds))
    print("F1 score LR:")
    print(f1s(y_test, test_preds))
    print("Precision score LR:")
    print(ps(y_test, test_preds))
    print("Recall score LR:")
    print(rs(y_test, test_preds))
    print("Classification Report")
    print(cr(y_test, test_preds))
    print("Accuracy score")
    print(asc(y_test, test_preds))

    # Confusion-matrix heatmap.
    class_names = [0, 1]  # name of classes
    fig, ax = plt.subplots()
    ticks = np.arange(len(class_names))
    plt.xticks(ticks, class_names)
    plt.yticks(ticks, class_names)
    sns.heatmap(pd.DataFrame(cfm(y_test, test_preds)), annot=True,
                cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # ROC curve and AUC on the positive-class probabilities.
    y_pred_proba = loaded_model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
    plt.legend(loc=4)
    plt.show()

    single_pred = loaded_model.predict(features)
    print("predico unica", single_pred)
    return single_pred
def keras_nn_model(self):
    """Build, train and evaluate a small dense binary classifier.

    Bug fix: the StandardScaler is now fitted on the training data only and
    applied with ``transform`` to both splits. The original called
    ``fit_transform`` on the test set too, re-fitting the scaler on test
    statistics (test-set leakage) and scaling train/test inconsistently.
    """
    # Seed numpy and TensorFlow for reproducibility.
    from numpy.random import seed
    seed(1)
    from tensorflow import set_random_seed
    set_random_seed(2)

    self.model1 = Sequential()
    self.model1.add(Dense(256, input_dim=428, activation='relu'))
    self.model1.add(Dense(128, activation='relu'))
    self.model1.add(Dense(64, activation='relu'))
    self.model1.add(Dense(4, activation='relu'))
    self.model1.add(Dense(1, activation='sigmoid'))

    # Fit the scaler on the training data only, then transform both sets.
    scaler = StandardScaler()
    scaler.fit(self.x_train)
    self.X_train = scaler.transform(self.x_train)
    self.X_test = scaler.transform(self.x_test)

    adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                 epsilon=None, decay=0.0, amsgrad=False)
    self.model1.compile(optimizer=adam,
                        loss=keras.losses.binary_crossentropy)
    # class_weight up-weights the positive class 8x to counter imbalance.
    self.model1.fit(self.X_train, self.y_train, epochs=1, batch_size=256,
                    class_weight={0: 1, 1: 8})

    pred = self.model1.predict(self.X_test)
    pred1 = self.model1.predict(self.X_train)
    print(auc_score(self.y_test, pred))

    # Threshold maximizing Youden's J (tpr - fpr) on the training ROC.
    self.fpr, self.tpr, self.threshold = roc_curve(self.y_train,
                                                   pred1.ravel())
    optimal_idx = np.argmax(self.tpr - self.fpr)
    self.optimal = self.threshold[optimal_idx]

    # For recall calculation: classify test probabilities at the chosen
    # threshold via self.class_value.
    self.out = pd.DataFrame({
        "y_true": self.y_test,
        "y_pred": pred.ravel()
    })
    self.out["predicted_class"] = self.out["y_pred"].apply(
        self.class_value)
    print(rs(self.out["y_true"], self.out["predicted_class"]))
    print(ase(self.out["y_true"], self.out["predicted_class"]))
def prediction_on_test(self):
    """Score the fitted model on the held-out test set, using a decision
    threshold tuned on the training ROC curve."""
    self.probs = self.model.predict_proba(self.x_test)
    self.preds = self.probs[:, 1]
    self.prob1 = self.model.predict_proba(self.x_train)

    # Threshold maximizing Youden's J statistic (tpr - fpr), derived from
    # the TRAINING probabilities so the test set stays untouched.
    self.fpr, self.tpr, self.threshold = roc_curve(self.y_train,
                                                   self.prob1[:, 1])
    self.optimal = self.threshold[np.argmax(self.tpr - self.fpr)]

    self.out1 = pd.DataFrame({"y_true": self.y_test, "y_pred": self.preds})
    self.out1["predicted_class"] = self.out1["y_pred"].apply(
        self.class_value)

    print(rs(self.out1["y_true"], self.out1["predicted_class"]))
    print(ase(self.out1["y_true"], self.out1["predicted_class"]))
    print(auc_score(self.y_test, self.preds))
# Model-6 Lasso Regression
print("Model-6 Lasso Regression")

# Grid-search the regularization strength over a log-spaced range, scoring
# by negative MAE with 20-fold cross-validation on all cores.
tuned_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                          10000, 100000]}
model = GridSearchCV(Lasso(), tuned_params,
                     scoring='neg_mean_absolute_error', cv=20, n_jobs=-1)
model.fit(X_train, y_train)
print("model.best_estimator", model.best_estimator_)

# Predict Train results
y_train_pred = model.predict(X_train)
# Predict Test results
y_pred = model.predict(X_test)

print("Train Results for Lasso Regression:")
print("*******************************")
print("Root mean squared error: ", sqrt(mse(y_train.values, y_train_pred)))
print("R-squared: ", rs(y_train.values, y_train_pred))
print("Mean Absolute Error: ", mae(y_train.values, y_train_pred))

# Feature Importance
print("Feature Importance")
# Building the model again with the best hyperparameters, then rank the
# features by the magnitude of their Lasso coefficients.
model = Lasso(alpha=1000)
model.fit(X_train, y_train)
indices = np.argsort(-abs(model.coef_))
print("The features in order of importance are:")
print(50 * '-')
for feature in X.columns[indices]:
    print(feature)
print("#################################################################")
trainINSTAGRAM = np.array(train[['CO2EMISSIONS']])
rgr.fit(trainTIKTOK, trainINSTAGRAM)  # .fit applies model to data
print(rgr.coef_)
print(rgr.intercept_)


def future(data1, intercept, slope):
    """Predict a target value from one feature using a fitted line."""
    return data1 * slope + intercept


enginesizevar1 = 3.3  # liters
ee = future(enginesizevar1, rgr.intercept_[0], rgr.coef_[0][0])
print(ee)

# check for acccuracy
from sklearn.metrics import r2_score as rs
testTIKTOK = np.array(testdata[['ENGINESIZE']])
testINSTAGRAM = np.array(testdata[['CO2EMISSIONS']])
ra = rgr.predict(testTIKTOK)
absolute = np.mean(np.absolute(ra - testINSTAGRAM))
# BUG FIX: r2_score expects (y_true, y_pred) and is NOT symmetric; the
# original passed the prediction first, producing a wrong R^2.
r2score = rs(testINSTAGRAM, ra)
print(absolute)
print(r2score)
# Split each series into a training prefix (first b points) and a
# held-out tail used for validation.
trad, td = death[:b], death[b:]
tranc, tnc = ncase[:b], ncase[b:]
trand, tnd = ndeath[:b], ndeath[b:]
trawc, twc = wcase[:b], wcase[b:]
trawd, twd = wdeath[:b], wdeath[b:]
trawnc, twnc = wncase[:b], wncase[b:]
trawnd, twnd = wndeath[:b], wndeath[b:]

# Fit a cubic polynomial to each training series and score it against the
# held-out tail.
modelc = n.poly1d(n.polyfit(trax, trac, 3))
conc = rs(tc, modelc(tx))
modeld = n.poly1d(n.polyfit(trax, trad, 3))
cond = rs(td, modeld(tx))
modelnc = n.poly1d(n.polyfit(trax, tranc, 3))
connc = rs(tnc, modelnc(tx))
modelnd = n.poly1d(n.polyfit(trax, trand, 3))
connd = rs(tnd, modelnd(tx))
modelwc = n.poly1d(n.polyfit(trax, trawc, 3))
conwc = rs(twc, modelwc(tx))
modelwd = n.poly1d(n.polyfit(trax, trawd, 3))
conwd = rs(twd, modelwd(tx))
modelwnc = n.poly1d(n.polyfit(trax, trawnc, 3))
conwnc = rs(twnc, modelwnc(tx))
modelwnd = n.poly1d(n.polyfit(trax, trawnd, 3))
conwnd = rs(twnd, modelwnd(tx))

# Cubic fit over the full case series.
mc = n.poly1d(n.polyfit(x, case, 3))
# make predictions
y_pred_rfc = rfc.predict(X_test)
y_pred_lr = lr.predict(X_test)
y_pred_knn = knn.predict(X_test)

# get the metrics
# BUG FIX: sklearn metrics expect (y_true, y_pred); the original passed the
# prediction first. Accuracy is symmetric, but precision and recall are
# not — the reported values were silently swapped.
accs_rfc.append(acc(y_test, y_pred_rfc))
accs_lr.append(acc(y_test, y_pred_lr))
accs_knn.append(acc(y_test, y_pred_knn))
ps_rfc.append(ps(y_test, y_pred_rfc))
ps_lr.append(ps(y_test, y_pred_lr))
ps_knn.append(ps(y_test, y_pred_knn))
rs_rfc.append(rs(y_test, y_pred_rfc))
rs_lr.append(rs(y_test, y_pred_lr))
rs_knn.append(rs(y_test, y_pred_knn))
print(i)
# ==============================
# examine performances of all models
"""
Note - can see that across all metrics, logistic regression performs best
"""
# accuracy
plt.figure(figsize=(12, 6))
plt.grid()
sns.distplot(accs_rfc, hist=False, kde_kws={"shade": True}, label='RFC')
# Extracting the data set into X and Y values
X = data[['gender', 'age', 'fever', 'dry cough', 'difficulty in breathing',
          'tiredness', 'soar_throat', 'nasal_congestion', 'diff_symptoms']]
Y = data['result']

# Spliting data set into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

# Training the model
rf = RandomForestClassifier(n_estimators=50, random_state=1)
# BUG FIX: the original fitted on the FULL data set (rf.fit(X, Y)), so the
# test rows were part of training and every test metric below was inflated.
# Fit on the training split only.
rf.fit(X_train, Y_train)

# predicting the values
pred = np.array(rf.predict(X_test))
recall = rs(Y_test, pred)
precision = ps(Y_test, pred)
f1 = fs(Y_test, pred)
ma = rf.score(X_test, Y_test)

# Printing All score
print('*** Evaluation metrics for test dataset ***\n')
print('Recall Score: ', recall)
print('Precision Score: ', precision)
print('F1 Score: ', f1)
print('Accuracy: ', ma)

a = pd.DataFrame(Y_test)
a['pred'] = rf.predict(X_test)
print('\n\tTable 3\n')
print(a.head())
knn3.fit(x_train, y_train) print "Accuracy Training KNN:", knn3.score(x_train, y_train) predictions = knn3.predict(x_test) accuracy = metrics.accuracy_score(y_test, predictions) print "Accuracy Test KNN:", accuracy print "Matriz de Confusao KNN:" print cfm(y_test, predictions) print "F1 score KNN:" print f1s(y_test, predictions) print "Precision score KNN:" print ps(y_test, predictions) print "Recall score KNN:" print rs(y_test, predictions) #svm kernel linear svm = svm.SVC(kernel='linear', C=1.0) svm.fit(x_train, y_train) predictionsSvm = svm.predict(x_test) accuracySvm = metrics.accuracy_score(predictionsSvm, y_test) print "SVM LINEAR Accuracy Test:", accuracySvm print "Matriz de Confusao SVM LINEAR:" print cfm(y_test, predictionsSvm) print "F1 score SVM LINEAR:"