def Decisiontreeregressor(self):
    """Fit a default decision-tree regressor and plot predicted order totals.

    Trains ``DecisionTreeRegressor`` on ``self.x_train`` / ``self.y_train``,
    predicts for ``self.x_test``, sums the predicted ``num_orders`` per
    ``week`` (or per ``date`` when the predictions have no ``week`` column),
    hands that aggregate to ``ideaLib.py2idea`` and draws it in red over
    ``self.ts_tot_orders`` (blue) on a Tk canvas attached to ``self.ml``.

    Side effects: prints both intermediate frames, replaces ``self.canvas``
    and sets ``self.is_canvas_ml`` to 1.
    """
    dt = DecisionTreeRegressor()
    dt.fit(self.x_train, self.y_train)
    pred = pd.DataFrame(dt.predict(self.x_test))

    # reset_index() gives the features a fresh 0..n-1 index, matching the
    # index of the prediction frame, so the concat lines up row by row.
    x_test = self.x_test.reset_index()
    predictions = pd.concat([x_test, pred], axis=1)
    predictions['num_orders'] = predictions[0]
    predictions = predictions.drop([0], axis=1)
    print("predictions\n", predictions)

    # Aggregate per week when available, otherwise per date.  Both paths
    # produce a DataFrame so the export below always gets the same type.
    group_key = 'week' if 'week' in predictions else 'date'
    ts_tot_pred = pd.DataFrame(
        predictions.groupby([group_key])['num_orders'].sum())
    print("ts_tot_pred\n", ts_tot_pred)

    if self.is_canvas_ml == 1:
        # A canvas from an earlier run is still packed; hide it first.
        self.canvas.get_tk_widget().pack_forget()

    fig = Figure(figsize=(5, 5), dpi=100)
    # Create the axes once: calling add_subplot(111) a second time makes a
    # new, overlapping Axes on current Matplotlib instead of reusing it.
    ax = fig.add_subplot(111)
    ax.plot(self.ts_tot_orders, color='Blue')
    ax.plot(ts_tot_pred, color='Red')

    ideaLib.py2idea(dataframe=ts_tot_pred, databaseName='ts_tot_pred_dt', client=client)

    self.canvas = FigureCanvasTkAgg(fig, master=self.ml)  # A tk.DrawingArea.
    self.canvas.get_tk_widget().pack(side=RIGHT)
    self.canvas.draw()
    self.is_canvas_ml = 1
def FitDecisionTree(self, train_predictors, test_predictors, train_target, test_target, params=None):
    """Fit a ``DecisionTreeRegressor`` (random_state=42) and evaluate it.

    Args:
        train_predictors: Training features passed to ``fit``.
        test_predictors: Features forwarded to ``self.evaluateModel``.
        train_target: Training target; ``train_target.values.ravel()`` is
            what gets passed to ``fit``.
        test_target: Target forwarded to ``self.evaluateModel``.
        params: Optional mapping that may hold ``max_depth``,
            ``max_leaf_nodes`` and ``min_samples_leaf``.  Keys that are
            absent fall back to the estimator's own defaults; other keys
            are ignored.  ``None`` or an empty mapping means "all defaults".

    Returns:
        A pair ``([min_samples_leaf, max_depth, max_leaf_nodes, rmse],
        predictions)`` where ``rmse`` and ``predictions`` are the two values
        returned by ``self.evaluateModel``.
    """
    # Listed in the (alphabetical) order get_params() reports them, so the
    # default-parameter printout below keeps its original ordering.
    tuned_keys = ("max_depth", "max_leaf_nodes", "min_samples_leaf")
    params = params or {}
    overrides = {key: params[key] for key in tuned_keys if key in params}
    dt = DecisionTreeRegressor(random_state=42, **overrides)

    if params:
        # Report the values the estimator will actually use, which also
        # covers tuning keys the caller left out.
        effective = dt.get_params()
        print("Fitting with max_depth = " + str(effective["max_depth"])
              + ", max_leaf_nodes = " + str(effective["max_leaf_nodes"])
              + ", min_samples_leaf = " + str(effective["min_samples_leaf"])
              + " ...")
    else:
        print("Fitting with default parameters...")

    dt_model = dt.fit(train_predictors, train_target.values.ravel())
    dt_rmse, dt_predictions = self.evaluateModel(
        model=dt_model,
        test_predictors=test_predictors,
        test_target=test_target,
        modelName='Decision Tree')

    dt_paramMap = dt_model.get_params()
    if not params:
        # With no explicit tuning, show which defaults were in effect.
        for key in tuned_keys:
            print(key, dt_paramMap[key])

    summary = [
        dt_paramMap["min_samples_leaf"],
        dt_paramMap["max_depth"],
        dt_paramMap["max_leaf_nodes"],
        dt_rmse,
    ]
    return summary, dt_predictions
plt.title('LogReg Precision, Recall, and fbeta Curves') sns.despine() lr_coefs = list(zip(X.columns, logreg.coef_[0])) lr_coefs_df = pd.DataFrame(lr_coefs) lr_top_coefs = [x for x in lr_coefs if np.abs(x[1]) > .07] lr_top_coefs = sorted(lr_top_coefs, key=(lambda x: x[1]), reverse=True) lr_top_coefs_df = pd.DataFrame(lr_top_coefs) plt.barh([x[0] for x in lr_top_coefs], width=[x[1] for x in lr_top_coefs]) plt.title('LogOdds') plt.grid(b=False) sns.despine() dt = DecisionTreeClassifier(max_depth=5) dt.fit(X_train, y_train) # Calculate fbeta for decision tree all_fbeta_dt, best_fbeta_dt = fbeta(dt, X_test=X_test) # not scaled data for dt # Calculate ROC Score and AUC for decision tree fpr_dt, tpr_dt, thresholds_dt = roc_curve( y_test, dt.predict_proba(X_test)[:, 1]) # not scaled data for dt auc_dt = roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1]) # This allows us to make a decision tree real fast directly in the notebook! dot_data = StringIO() export_graphviz(dt, out_file=dot_data,
# Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

# scikit-learn metrics take (y_true, y_pred).  MSE is symmetric, but R^2 is
# not, so the true values must come first for the score to be correct.
lr_rmse_score = np.sqrt(mean_squared_error(y_val, y_pred))
lr_r2_score = r2_score(y_val, y_pred)
print("Root Mean Squared Error :", lr_rmse_score)
print("R2Score :", lr_r2_score)

# In[86]:

# Decision tree
dt = DecisionTreeRegressor()
dt_model = dt.fit(X_train, y_train)
y_pred_dtone = dt_model.predict(X_val)

## calculate RMSE
rms_dt = np.sqrt(mean_squared_error(y_val, y_pred_dtone))
r2_dt = r2_score(y_val, y_pred_dtone)
print('RMSE of Decision Tree Regression:', rms_dt)
print('R-Squared value:', r2_dt)

# Adjusted R^2 for the linear model's validation predictions (y_pred).
# n must be the number of rows R^2 was computed on, i.e. the validation
# set, not the training set; p is the number of predictors.
R2 = lr_r2_score
n = X_val.shape[0]
p = len(X_train.columns)
Adj_r2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
print('Adjusted R-Square is : ', Adj_r2)

# In[ ]:
sc = students.join(courses, "CO_CURSO") sci = sc.join(institutions, "CO_IES").drop("CO_ALUNO_SITUACAO", "CO_OCDE_AREA_GERAL", "CO_UF_IES", "CO_IES", "CO_CURSO") todas = ["EVASOR", "IN_RESERVA_ENSINO_PUBLICO", "IN_RESERVA_RENDA_FAMILIAR"] # for i in range(0,len(todas)-1): features = todas i = 0 varx = features.pop(0) assembler = VectorAssembler(inputCols=features, outputCol="features") dataFinal = assembler.transform(sci) dt = DecisionTreeClassifier(labelCol=varx, featuresCol='features', maxDepth=5) (treinamento, teste) = dataFinal.randomSplit([0.8, 0.2]) model = dt.fit(treinamento) predictions = model.transform(teste) # print model.toDebugString total = predictions.count() missed = predictions.where(str(varx) + " != prediction").count() _00 = predictions.where(varx + "=0 and prediction = 0").count() _01 = predictions.where(varx + "=0 and prediction = 1").count() _10 = predictions.where(varx + "=1 and prediction = 0").count() _11 = predictions.where(varx + "=1 and prediction = 1").count() print sys.argv[1] print "-----\n" print total, "Erradas: ", missed, "Erro(%): ", float(missed) / float( total) * 100 print "0\t", _00, "\t|\t", _01 print "1\t", _10, "\t|\t", _11