Esempio n. 1
0
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

from olist.order import Order

# NOTE: train_test_split is called further down, but none of this script's
# imports provide it, so the split raised NameError. It is imported here so
# the script runs as written.
from sklearn.model_selection import train_test_split

# Label for this experiment; it is not read anywhere in the visible part of
# the script.
model = 'gradient_boost_2'

print("Importing dataset")
orders = Order().get_training_data()

# Features: every column except the order identifier, the target itself
# ('wait_time') and the two other wait-time columns.
# NOTE(review): 'delay_vs_expected' and 'expected_wait_time' are presumably
# dropped to avoid leaking the target into the features - confirm.
X = orders.drop(
    ['order_id', 'wait_time', 'delay_vs_expected', 'expected_wait_time'],
    axis=1)
# Target: the 'wait_time' column.
y = orders['wait_time']

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)

# Average CV score on the training set was: -62.99467662836531
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    GradientBoostingRegressor(alpha=0.99,
                              learning_rate=0.1,
                              loss="huber",
Esempio n. 2
0
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

#import data
from olist.order import Order
from olist.data import Olist

# NOTE: the feature engineering below uses pandas (pd) and numpy (np), which
# this script never imported, so it raised NameError. They are imported here
# so the script runs as written.
import numpy as np
import pandas as pd

data = Olist().get_data()
training_orders = Order().get_training_data()

orders = data['olist_orders_dataset']

# Estimated wait time, expressed in units of 24 hours: the estimated delivery
# date minus the purchase timestamp. The new column is written onto the frame
# stored in `data` (no copy is taken).
estimated_delivery = pd.to_datetime(orders['order_estimated_delivery_date'])
purchased_at = pd.to_datetime(orders['order_purchase_timestamp'])
orders['estimate_wait_time'] = (
    (estimated_delivery - purchased_at) / np.timedelta64(24, 'h'))

# Attach the estimate to each training row, matching on 'order_id'
# (merge is called without `how`, so pandas' default join type applies).
training_orders = training_orders.merge(
    orders[['estimate_wait_time', 'order_id']], on='order_id')

# Features: everything except the identifier, the target and the delay column.
X = training_orders.drop(['order_id', 'wait_time', 'delay_vs_expected'],
                         axis=1)
# Target: the 'wait_time' column.
y = training_orders['wait_time']

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# Run the TPOT pipeline search on the training split, report its score on the
# held-out split, then export the best pipeline found as a Python script.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# NOTE(review): the output file name mentions 'boston' although the data used
# here comes from Olist - consider renaming; left unchanged to preserve the
# script's output path.
tpot.export('tpot_boston_pipeline.py')