def remove_before_changepoint(data: pd.HDFStore, select_idx: pd.Index = None):
    """Return 'train' row coordinates that pass every changepoint filter.

    For each ``(store, date)`` pair in a hard-coded table, ``data`` is asked
    for the coordinates of rows in the ``'train'`` table matching
    ``Store != store or Date > pd.Timestamp(date)``. The per-store results
    are intersected with each other and with ``select_idx`` when it is given.

    Args:
        data: Store queried through ``select_as_coordinates`` on ``'train'``.
        select_idx: Optional coordinates to start from. When ``None``, the
            first query result is used as the starting selection.

    Returns:
        The intersection of ``select_idx`` (if supplied) with the result of
        every per-store query.
    """
    changepoints = {
        837: '2014-03-16',
        700: '2014-01-03',
        681: '2013-06-14',
        986: '2013-05-22',
        885: '2014-05-18',
        589: '2013-05-27',
        105: '2013-05-20',
        663: '2013-10-06',
        764: '2013-04-24',
        364: '2013-05-31',
        969: '2013-03-10',
        803: '2014-01-07',
        91: '2014-01-14',
    }
    # The query text names the loop variables `store` and `date` directly, so
    # they must keep exactly these names and the query must be issued from
    # this function's own frame. That is also why the IDE reports them as
    # unused (see the suppression below).
    where = 'Store != store or Date > pd.Timestamp(date)'

    # noinspection PyUnusedLocal
    for store, date in changepoints.items():
        surviving = data.select_as_coordinates('train', where)
        select_idx = (surviving if select_idx is None
                      else select_idx.intersection(surviving))
    return select_idx
def glm_predictions(data: pd.HDFStore, output: pd.HDFStore,
                    model_save_dir=None, predict_train=True,
                    from_saved_model=False):
    """Obtain a GLM model and write its predictions to ``output``.

    The model is either loaded with ``get_saved_glm_model`` or trained on a
    filtered selection of the ``'train'`` table in ``data``. Training also
    writes the log of the ``Sales`` column to ``data`` under
    ``'train_logsales'``.

    Args:
        data: Store holding the ``'train'`` and ``'test'`` tables.
        output: Store that receives predictions under ``'test/glm'`` and,
            when ``predict_train`` is truthy, ``'train/glm'``.
        model_save_dir: If truthy, a freshly trained model is saved there.
            Also the load location when ``from_saved_model is True``.
        predict_train: Whether to predict on the ``'train'`` table as well.
        from_saved_model: Falsy to train a new model; ``True`` to load from
            ``model_save_dir``; any other truthy value is passed to
            ``get_saved_glm_model`` as the load location.
    """
    # Referenced by name from the 'Store in test_set_stores' query below, so
    # this local must keep its name.
    test_set_stores = data.select_column('test', 'Store').unique()

    if not from_saved_model:
        reduced_msg = "Reduced to {0}"

        ## Keep only rows that pass the changepoint filter.
        logger.info("Dropping store data before changepoint.")
        kept = remove_before_changepoint(data, None)
        logger.info(reduced_msg.format(len(kept)))

        ## Keep only stores that also occur in the test set.
        logger.info("Dropping stores not in test set. Initial shape")
        kept = kept.intersection(
            data.select_as_coordinates('train', 'Store in test_set_stores'))
        logger.info(reduced_msg.format(len(kept)))

        ## Keep only rows with positive sales, then store log(Sales).
        logger.debug("Log transform on sales data")
        kept = kept.intersection(
            data.select_as_coordinates('train', 'Sales > 0'))
        # The log is taken over the whole Sales column, not just the kept
        # rows; the 'divide by zero' context is presumably there for the
        # zero-sales rows -- TODO confirm what warnings_to_log does.
        with warnings_to_log('divide by zero'):
            sales = data.select('train', 'columns = Sales')
            data.put('train_logsales', np.log(sales), data_columns=True)
        logger.info(reduced_msg.format(len(kept)))

        ## Outlier removal.
        kept = remove_outliers_lm(data, kept, log_lm_features,
                                  test_set_stores)
        logger.info("Removed outliers, reduced shape {0}".format(len(kept)))

        ## Fit the model on the remaining rows.
        logger.info("Running glm training")
        features = DataFromHDF(data_store=data, key='train',
                               select_idx=kept, columns=linear_features)
        target = DataFromHDF(data_store=data, key='train_logsales',
                             select_idx=kept, column='Sales')
        model = GLMPredictions(stores=test_set_stores, steps=15, step_by=3)
        model.fit(features, target)

        ## Persist the freshly trained model when a directory was given.
        if model_save_dir:
            model.save_model(model_save_dir)
    elif from_saved_model is True:
        model = get_saved_glm_model(model_save_dir)
    else:
        model = get_saved_glm_model(from_saved_model)

    ## Predictions for the test table.
    logger.info("glm predictions on test set")
    test_features = DataFromHDF(data_store=data, key='test',
                                columns=linear_features)
    test_sink = DataFromHDF(data_store=output, key='test/glm',
                            data_columns=True)
    test_sink.put(model.predict(test_features))

    if not predict_train:
        return

    ## Predictions for the full training table (no row selection applied).
    logger.info("glm predictions on training set")
    train_features = DataFromHDF(data_store=data, key='train',
                                 columns=linear_features)
    train_sink = DataFromHDF(data_store=output, key='train/glm',
                             data_columns=True)
    train_sink.put(model.predict(train_features))
def xgb_predictions(data: pd.HDFStore, output: pd.HDFStore,
                    model_save_dir=None, predict_train=True,
                    from_saved_model=False):
    """Obtain an xgboost model and write its predictions to ``output``.

    The model is either loaded with ``get_saved_xgb_model`` or trained on a
    filtered selection of the ``'train'`` table in ``data``. Training also
    writes the log of the ``Sales`` column to ``data`` under
    ``'train_logsales'``.

    Args:
        data: Store holding the ``'train'`` and ``'test'`` tables.
        output: Store that receives predictions under ``'test/xgb'`` and,
            when ``predict_train`` is truthy, ``'train/xgb'``.
        model_save_dir: If truthy, a freshly trained model is saved there.
            Also the load location when ``from_saved_model is True``.
        predict_train: Whether to also predict on the ``'train'`` rows whose
            store matches ``'Store in test_set_stores'``.
        from_saved_model: Falsy to train a new model; ``True`` to load from
            ``model_save_dir``; any other truthy value is passed to
            ``get_saved_xgb_model`` as the load location.
    """
    # Only used by name from the 'Store in test_set_stores' queries below, so
    # this local must keep its name even though the IDE reports it as unused.
    # noinspection PyUnusedLocal
    test_set_stores = data.select_column('test', 'Store').unique()

    if not from_saved_model:
        reduced_msg = "Reduced to {0}"

        ## Keep only rows that pass the changepoint filter.
        logger.info("Dropping store data before changepoint.")
        kept = remove_before_changepoint(data, None)
        logger.info(reduced_msg.format(len(kept)))

        ## Keep only stores that also occur in the test set.
        logger.info("Dropping stores not in test set. Initial shape")
        kept = kept.intersection(
            data.select_as_coordinates('train', 'Store in test_set_stores'))
        logger.info(reduced_msg.format(len(kept)))

        ## Keep only rows with positive sales, then store log(Sales).
        logger.debug("Log transform on sales data")
        kept = kept.intersection(
            data.select_as_coordinates('train', 'Sales > 0'))
        # The log is taken over the whole Sales column, not just the kept
        # rows; the 'divide by zero' context is presumably there for the
        # zero-sales rows -- TODO confirm what warnings_to_log does.
        with warnings_to_log('divide by zero'):
            sales = data.select('train', 'columns = Sales')
            data.put('train_logsales', np.log(sales), data_columns=True)
        logger.info(reduced_msg.format(len(kept)))

        ## Fit the model on the remaining rows.
        logger.info("Running xgboost training")
        features = DataFromHDF(data_store=data, key='train',
                               select_idx=kept, columns=xgb_features)
        target = DataFromHDF(data_store=data, key='train_logsales',
                             select_idx=kept, column='Sales')
        model = XGBPredictions(eval_function=xgb_expm1_rmspe,
                               params=xparams, nrounds=3000)
        model.fit(features, target)

        ## Persist the freshly trained model when a directory was given.
        if model_save_dir:
            model.save_model(model_save_dir)
    elif from_saved_model is True:
        model = get_saved_xgb_model(model_save_dir)
    else:
        model = get_saved_xgb_model(from_saved_model)

    ## Predictions for the test table.
    logger.info("xgboost predictions on test set")
    test_features = DataFromHDF(data_store=data, key='test',
                                columns=xgb_features)
    test_sink = DataFromHDF(data_store=output, key='test/xgb',
                            data_columns=True)
    test_sink.put(model.predict(test_features))

    if not predict_train:
        return

    ## Chunked predictions for training rows of stores in the test set.
    logger.info("xgboost predictions on training set")
    train_sink = DataFromHDF(data_store=output, key='train/xgb',
                             data_columns=True)
    train_rows = data.select_as_coordinates('train',
                                            'Store in test_set_stores')
    train_features = DataFromHDF(data_store=data, key='train',
                                 select_idx=train_rows, columns=xgb_features)
    predict_in_chunks(model, train_features, train_sink)