from sklearn.ensemble import GradientBoostingRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.metrics import mean_squared_error from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import make_pipeline from olist.order import Order model = 'gradient_boost_2' print("Importing dataset") orders = Order().get_training_data() # Create train and target variable X = orders.drop( ['order_id', 'wait_time', 'delay_vs_expected', 'expected_wait_time'], axis=1) y = orders['wait_time'] # Training test split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) # Average CV score on the training set was:-62.99467662836531 exported_pipeline = make_pipeline( MinMaxScaler(), GradientBoostingRegressor(alpha=0.99, learning_rate=0.1, loss="huber",
# Script: run a TPOT search for a regression pipeline that predicts an
# order's `wait_time`, then export the best pipeline found as Python source.
#
# numpy and pandas are imported here because the script calls
# `np.timedelta64` and `pd.to_datetime` below; without them the feature
# computation fails with NameError.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# import data
from olist.order import Order
from olist.data import Olist

# `data` is indexed by table name below; `training_orders` is the
# per-order training table that gets merged on `order_id`.
data = Olist().get_data()
training_orders = Order().get_training_data()

# Extra feature: the gap between the purchase timestamp and the estimated
# delivery date. Dividing the timedelta by a 24-hour unit yields a float
# number of days.
# NOTE: this adds the column to the DataFrame stored in `data` in place.
orders = data['olist_orders_dataset']
orders['estimate_wait_time'] = (
    pd.to_datetime(orders['order_estimated_delivery_date'])
    - pd.to_datetime(orders['order_purchase_timestamp'])
) / np.timedelta64(24, 'h')

# Attach the new feature to the training table. `merge` defaults to an inner
# join, so only orders present in both frames are kept.
training_orders = training_orders.merge(
    orders[['estimate_wait_time', 'order_id']],
    on='order_id',
)

# Features / target. The identifier and the target itself are dropped from X.
# `delay_vs_expected` is dropped as well -- presumably because it is derived
# from the actual delivery and would leak the target; confirm against
# Order.get_training_data().
X = training_orders.drop(
    ['order_id', 'wait_time', 'delay_vs_expected'],
    axis=1,
)
y = training_orders['wait_time']

# Hold out 33% of the rows for scoring; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Evolutionary pipeline search: 5 generations of 50 candidate pipelines each.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)

# Report the held-out score, then write the winning pipeline out as a script.
print(tpot.score(X_test, y_test))
tpot.export('tpot_boston_pipeline.py')