def xgb_predictions(data: pd.HDFStore, output: pd.HDFStore, model_save_dir=None, predict_train=True, from_saved_model=False):
    """Train (or load) an XGBoost model and write its predictions to *output*.

    Either fits a fresh ``XGBPredictions`` model on the filtered training data
    or restores a previously saved one, then writes log-sales predictions for
    the test set (and optionally the training set) into the *output* store.

    :param data: HDF store holding the ``train`` and ``test`` tables.
    :param output: HDF store that receives ``test/xgb`` / ``train/xgb`` predictions.
    :param model_save_dir: directory for persisting the fitted model;
        also the load location when ``from_saved_model is True``.
    :param predict_train: when truthy, also predict on the training rows.
    :param from_saved_model: ``True`` to load from *model_save_dir*, a path-like
        value to load from that location, falsy to train from scratch.
    """
    # noinspection PyUnusedLocal
    test_set_stores = data.select_column('test', 'Store').unique()

    if from_saved_model:
        # Restore a persisted model instead of training.
        source = model_save_dir if from_saved_model is True else from_saved_model
        xgb = get_saved_xgb_model(source)
    else:
        # --- build the training row selection -------------------------------
        logger.info("Dropping store data before changepoint.")
        keep_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(keep_idx)))

        logger.info("Dropping stores not in test set. Initial shape")
        coords = data.select_as_coordinates('train', 'Store in test_set_stores')
        keep_idx = keep_idx.intersection(coords)
        logger.info("Reduced to {0}".format(len(keep_idx)))

        logger.debug("Log transform on sales data")
        coords = data.select_as_coordinates('train', 'Sales > 0')
        keep_idx = keep_idx.intersection(coords)
        # log(0) warnings are routed to the logger rather than stderr;
        # zero-sales rows were excluded from keep_idx above anyway.
        with warnings_to_log('divide by zero'):
            data.put('train_logsales', np.log(data.select('train', 'columns = Sales')), data_columns=True)
        logger.info("Reduced to {0}".format(len(keep_idx)))

        # --- fit ------------------------------------------------------------
        logger.info("Running xgboost training")
        X = DataFromHDF(data_store=data, key='train', select_idx=keep_idx, columns=xgb_features)
        y = DataFromHDF(data_store=data, key='train_logsales', select_idx=keep_idx, column='Sales')
        xgb = XGBPredictions(eval_function=xgb_expm1_rmspe, params=xparams, nrounds=3000)
        xgb.fit(X, y)

        if model_save_dir:
            xgb.save_model(model_save_dir)

    # --- predict ------------------------------------------------------------
    logger.info("xgboost predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=xgb_features)
    xgb_output = DataFromHDF(data_store=output, key='test/xgb', data_columns=True)
    xgb_output.put(xgb.predict(X))

    if predict_train:
        logger.info("xgboost predictions on training set")
        xgb_output = DataFromHDF(data_store=output, key='train/xgb', data_columns=True)
        train_idx = data.select_as_coordinates('train', 'Store in test_set_stores')
        X = DataFromHDF(data_store=data, key='train', select_idx=train_idx, columns=xgb_features)
        # Training set is large; stream predictions instead of one big call.
        predict_in_chunks(xgb, X, xgb_output)
def glm_predictions(data: pd.HDFStore, output: pd.HDFStore, model_save_dir=None, predict_train=True, from_saved_model=False):
    """Train (or load) per-store GLM models and write predictions to *output*.

    Mirrors :func:`xgb_predictions` but uses ``GLMPredictions`` over the
    linear feature set, with an extra outlier-removal pass before fitting.

    :param data: HDF store holding the ``train`` and ``test`` tables.
    :param output: HDF store that receives ``test/glm`` / ``train/glm`` predictions.
    :param model_save_dir: directory for persisting the fitted model;
        also the load location when ``from_saved_model is True``.
    :param predict_train: when truthy, also predict on the training rows.
    :param from_saved_model: ``True`` to load from *model_save_dir*, a path-like
        value to load from that location, falsy to train from scratch.
    """
    test_set_stores = data.select_column('test', 'Store').unique()

    if from_saved_model:
        # Restore a persisted model instead of training.
        source = model_save_dir if from_saved_model is True else from_saved_model
        glm = get_saved_glm_model(source)
    else:
        # --- build the training row selection -------------------------------
        logger.info("Dropping store data before changepoint.")
        keep_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(keep_idx)))

        logger.info("Dropping stores not in test set. Initial shape")
        coords = data.select_as_coordinates('train', 'Store in test_set_stores')
        keep_idx = keep_idx.intersection(coords)
        logger.info("Reduced to {0}".format(len(keep_idx)))

        logger.debug("Log transform on sales data")
        coords = data.select_as_coordinates('train', 'Sales > 0')
        keep_idx = keep_idx.intersection(coords)
        # log(0) warnings are routed to the logger rather than stderr;
        # zero-sales rows were excluded from keep_idx above anyway.
        with warnings_to_log('divide by zero'):
            data.put('train_logsales', np.log(data.select('train', 'columns = Sales')), data_columns=True)
        logger.info("Reduced to {0}".format(len(keep_idx)))

        # GLMs are sensitive to outliers, so drop them before fitting.
        keep_idx = remove_outliers_lm(data, keep_idx, log_lm_features, test_set_stores)
        logger.info("Removed outliers, reduced shape {0}".format(len(keep_idx)))

        # --- fit ------------------------------------------------------------
        logger.info("Running glm training")
        X = DataFromHDF(data_store=data, key='train', select_idx=keep_idx, columns=linear_features)
        y = DataFromHDF(data_store=data, key='train_logsales', select_idx=keep_idx, column='Sales')
        glm = GLMPredictions(stores=test_set_stores, steps=15, step_by=3)
        glm.fit(X, y)

        if model_save_dir:
            glm.save_model(model_save_dir)

    # --- predict ------------------------------------------------------------
    logger.info("glm predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=linear_features)
    glm_output = DataFromHDF(data_store=output, key='test/glm', data_columns=True)
    glm_output.put(glm.predict(X))

    if predict_train:
        logger.info("glm predictions on training set")
        X = DataFromHDF(data_store=data, key='train', columns=linear_features)
        glm_output = DataFromHDF(data_store=output, key='train/glm', data_columns=True)
        glm_output.put(glm.predict(X))
# Write the DataFrame into the store as a queryable 'table' (format='table'
# with data_columns=True enables where-clause selection later), then close.
store.put('df', df, data_columns=True, format='table')
print(df)  # fixed: Python 2 `print df` statement is a syntax error in Python 3
store.close()
# store['df']  # load it

# Read hdf5 by chunks
# https://towardsdatascience.com/why-and-how-to-use-pandas-with-large-data-9594dda2ea4c
# https://stackoverflow.com/questions/40348945/reading-data-by-chunking-with-hdf5-and-pandas
rd_store = HDFStore(fn)

# Chunked-read experiment (kept for reference, disabled):
# df = pd.DataFrame(columns=columns)
# chunksize = 4096
# %%timeit
# for chunk in pd.read_hdf(fn, 'df', chunksize=chunksize, where='h_m < 5.3'):
#     df = pd.concat([df, chunk], ignore_index=True)

# Select rows by time of day
# https://stackoverflow.com/questions/25681308/pandas-read-hdf-query-by-date-and-time-range
# NOTE: maybe it is better not to store this as a table at all if we only
# select by time (translated from the original Russian comment)
c = rd_store.select_column('df', 'timeticket')
# indexer_between_time yields positional row indices, which HDFStore.select
# accepts as a coordinate-style `where`.
where = pd.DatetimeIndex(c).indexer_between_time('12:00', '16:56')
resp = rd_store.select('df', where=where)
print(resp.info())  # fixed: parenthesized for Python 3
print(resp)

# Another selection approach:
# https://stackoverflow.com/questions/20502996/use-or-in-hdfstore-select-pandas