Code example #1
0
def xgb_predictions(data: pd.HDFStore,
                    output: pd.HDFStore,
                    model_save_dir=None,
                    predict_train=True,
                    from_saved_model=False):
    """Train (or load) an XGBoost model and store its sales predictions.

    Predictions for the test set are written to ``output`` under
    'test/xgb'; when ``predict_train`` is true, predictions for the
    training rows are written under 'train/xgb' as well.

    Parameters
    ----------
    data : pd.HDFStore
        Store holding the 'train' and 'test' tables. During training a
        derived 'train_logsales' table is written into it.
    output : pd.HDFStore
        Store that receives the prediction frames.
    model_save_dir : optional
        Where the fitted model is saved; also where a saved model is
        loaded from when ``from_saved_model`` is True.
    predict_train : bool
        Also emit predictions for the training rows.
    from_saved_model : bool or saved-model location
        False -> train from scratch. True -> load the model from
        ``model_save_dir``. Any other truthy value is passed straight
        to ``get_saved_xgb_model`` as the model location.
    """
    # +-
    ##
    # noinspection PyUnusedLocal
    # NOTE: test_set_stores *is* used -- it is referenced by name inside
    # the HDFStore query strings below ('Store in test_set_stores').
    test_set_stores = data.select_column('test', 'Store').unique()

    if from_saved_model:
        if from_saved_model is True:
            xgb = get_saved_xgb_model(model_save_dir)
        else:
            # from_saved_model itself names the saved model to load.
            xgb = get_saved_xgb_model(from_saved_model)

    else:

        logger.info("Dropping store data before changepoint.")
        # NOTE(review): second argument is None -- presumably "use the
        # default changepoints"; confirm against the helper's signature.
        select_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.info("Dropping stores not in test set. Initial shape")
        idx = data.select_as_coordinates('train', 'Store in test_set_stores')
        select_idx = select_idx.intersection(idx)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.debug("Log transform on sales data")
        # Keep only rows with positive sales for training ...
        idx = data.select_as_coordinates('train', 'Sales > 0')
        select_idx = select_idx.intersection(idx)
        # ... but log-transform the *entire* Sales column; rows with
        # Sales == 0 raise 'divide by zero' warnings, which are expected
        # and routed to the logger by this context manager.
        with warnings_to_log('divide by zero'):
            data.put('train_logsales',
                     np.log(data.select('train', 'columns = Sales')),
                     data_columns=True)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.info("Running xgboost training")
        # Features come from 'train'; the target is the log-sales table
        # written above, both restricted to the filtered row set.
        X = DataFromHDF(data_store=data,
                        key='train',
                        select_idx=select_idx,
                        columns=xgb_features)
        y = DataFromHDF(data_store=data,
                        key='train_logsales',
                        select_idx=select_idx,
                        column='Sales')
        xgb = XGBPredictions(eval_function=xgb_expm1_rmspe,
                             params=xparams,
                             nrounds=3000)
        xgb.fit(X, y)

        ##
        if model_save_dir:
            xgb.save_model(model_save_dir)

    ##
    logger.info("xgboost predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=xgb_features)
    xgb_output = DataFromHDF(data_store=output,
                             key='test/xgb',
                             data_columns=True)
    preds = xgb.predict(X)
    xgb_output.put(preds)

    ##
    if predict_train:
        logger.info("xgboost predictions on training set")
        xgb_output = DataFromHDF(data_store=output,
                                 key='train/xgb',
                                 data_columns=True)
        # Restrict to stores present in the test set, mirroring the
        # training-time filter.
        select_idx = data.select_as_coordinates('train',
                                                'Store in test_set_stores')
        X = DataFromHDF(data_store=data,
                        key='train',
                        select_idx=select_idx,
                        columns=xgb_features)
        # Chunked prediction -- presumably to bound memory on the larger
        # training table; confirm with predict_in_chunks.
        predict_in_chunks(xgb, X, xgb_output)
Code example #2
0
def glm_predictions(data: pd.HDFStore,
                    output: pd.HDFStore,
                    model_save_dir=None,
                    predict_train=True,
                    from_saved_model=False):
    """Train (or load) a per-store GLM and store its sales predictions.

    Predictions for the test set are written to ``output`` under
    'test/glm'; when ``predict_train`` is true, predictions for the
    training rows are written under 'train/glm' as well.

    Parameters
    ----------
    data : pd.HDFStore
        Store holding the 'train' and 'test' tables. During training a
        derived 'train_logsales' table is written into it.
    output : pd.HDFStore
        Store that receives the prediction frames.
    model_save_dir : optional
        Where the fitted model is saved; also where a saved model is
        loaded from when ``from_saved_model`` is True.
    predict_train : bool
        Also emit predictions for the training rows.
    from_saved_model : bool or saved-model location
        False -> train from scratch. True -> load the model from
        ``model_save_dir``. Any other truthy value is passed straight
        to ``get_saved_glm_model`` as the model location.
    """
    # +-
    # NOTE: test_set_stores is referenced by name inside the HDFStore
    # query strings below ('Store in test_set_stores'), so although it
    # looks unused it is required.
    test_set_stores = data.select_column('test', 'Store').unique()
    ##
    if from_saved_model:
        if from_saved_model is True:
            glm = get_saved_glm_model(model_save_dir)
        else:
            # from_saved_model itself names the saved model to load.
            glm = get_saved_glm_model(from_saved_model)

    else:

        ##
        logger.info("Dropping store data before changepoint.")
        # NOTE(review): second argument is None -- presumably "use the
        # default changepoints"; confirm against the helper's signature.
        select_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.info("Dropping stores not in test set. Initial shape")
        idx = data.select_as_coordinates('train', 'Store in test_set_stores')
        select_idx = select_idx.intersection(idx)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.debug("Log transform on sales data")
        # Keep only rows with positive sales for training ...
        idx = data.select_as_coordinates('train', 'Sales > 0')
        select_idx = select_idx.intersection(idx)
        # ... but log-transform the *entire* Sales column; rows with
        # Sales == 0 raise 'divide by zero' warnings, which are expected
        # and routed to the logger by this context manager.
        with warnings_to_log('divide by zero'):
            data.put('train_logsales',
                     np.log(data.select('train', 'columns = Sales')),
                     data_columns=True)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        # Extra filtering step (not done for xgboost): drop linear-model
        # outliers before fitting the GLM.
        select_idx = remove_outliers_lm(data, select_idx, log_lm_features,
                                        test_set_stores)
        logger.info("Removed outliers, reduced shape {0}".format(
            len(select_idx)))

        ##
        logger.info("Running glm training")
        # Features come from 'train'; the target is the log-sales table
        # written above, both restricted to the filtered row set.
        X = DataFromHDF(data_store=data,
                        key='train',
                        select_idx=select_idx,
                        columns=linear_features)
        y = DataFromHDF(data_store=data,
                        key='train_logsales',
                        select_idx=select_idx,
                        column='Sales')
        glm = GLMPredictions(stores=test_set_stores, steps=15, step_by=3)
        glm.fit(X, y)

        ##
        if model_save_dir:
            glm.save_model(model_save_dir)

    ##
    logger.info("glm predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=linear_features)
    glm_output = DataFromHDF(data_store=output,
                             key='test/glm',
                             data_columns=True)
    preds = glm.predict(X)
    glm_output.put(preds)

    ##
    if predict_train:
        logger.info("glm predictions on training set")
        # NOTE(review): unlike the test-set path, no store filter is
        # applied here -- predictions cover the full 'train' table;
        # confirm this asymmetry is intentional.
        X = DataFromHDF(data_store=data, key='train', columns=linear_features)
        glm_output = DataFromHDF(data_store=output,
                                 key='train/glm',
                                 data_columns=True)
        preds = glm.predict(X)
        glm_output.put(preds)
Code example #3
0
File: mem_disk_storage.py  Project: blockspacer/ml-cv
    store.put('df', df, data_columns=True, format='table')
    print df

    store.close()
    # store['df']  # load it
    #
    # Read hdf5 by chunks
    # https://towardsdatascience.com/why-and-how-to-use-pandas-with-large-data-9594dda2ea4c
    # https://stackoverflow.com/questions/40348945/reading-data-by-chunking-with-hdf5-and-pandas
    rd_store = HDFStore(fn)
    #
    # df = pd.DataFrame(columns=columns)
    # chunksize = 4096
    # # %%timeit
    # # for chunk in pd.read_hdf(fn, 'df', chunksize=chunksize, where='h_m < 5.3'):
    # #     df = pd.concat([df, chunk], ignore_index=True)
    #
    # # sel by time
    # # https://stackoverflow.com/questions/25681308/pandas-read-hdf-query-by-date-and-time-range
    # # Может лучше не таблицей хранить если выбирать по времени
    c = rd_store.select_column('df', 'timeticket')

    where = pd.DatetimeIndex(c).indexer_between_time('12:00', '16:56')
    #
    resp = rd_store.select('df', where=where)
    # print resp.info()
    print resp

    # Another selection
    # https://stackoverflow.com/questions/20502996/use-or-in-hdfstore-select-pandas