Example #1
def save_data(timelines, filename, logger, features=None):
    """
    split data
    :param timelines:
    :param filename:
    :param logger:
    :param features:
    :return:
    """
    data = load_pickle(filename)
    if features is not None:
        save_features = timelines.loc[:, features]
    else:
        save_features = timelines
    if data is not None:
        if data.time.astype(int).max() < save_features.time.astype(int).min():
            # DataFrame.append was removed in pandas 2.0; concat is the replacement
            data = pd.concat([data, save_features], sort=False)
            logger.info('Successfully appended new data to %s!' % filename)
        else:
            logger.info(
                '%s: Targeted Run-Time already in Collection-Interval!' %
                filename)
            del data
            return False
    else:
        data = save_features
    saved = save_pickle(data, filename)
    del data
    return saved
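Every example in this collection leans on the same load_pickle/save_pickle helpers, which are not shown on this page. A minimal sketch of what they plausibly look like, assuming plain pickle files under a pickles/ directory (the directory name and the return-None-on-missing-file behaviour are inferred from the call sites; Example #7 additionally passes a using_joblib flag, omitted here):

import os
import pickle

PICKLE_DIR = 'pickles'  # assumed location, matching the path prefix in Example #7

def load_pickle(name):
    """Return the unpickled object, or None when no pickle exists yet."""
    path = os.path.join(PICKLE_DIR, name + '.pkl')
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as fh:
        return pickle.load(fh)

def save_pickle(obj, name):
    """Pickle obj under the given name; return True on success."""
    path = os.path.join(PICKLE_DIR, name + '.pkl')
    try:
        os.makedirs(PICKLE_DIR, exist_ok=True)
        with open(path, 'wb') as fh:
            pickle.dump(obj, fh)
        return True
    except OSError:
        return False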
Example #2
def fetch_data(start_time,
               end_time,
               table,
               topic,
               columns=None,
               record_path=None,
               meta=None,
               save_file_prefix=""):
    df = load_pickle(save_file_prefix + topic)
    # Re-collect when no pickle exists yet or the cached frame is empty.
    if df is None or len(df) == 0:
        df = collect_batch_data(start_time=start_time,
                                end_time=end_time,
                                table=table,
                                topic=topic,
                                columns=columns,
                                record_path=record_path,
                                meta=meta)
        save_pickle(df, save_file_prefix + topic)
    logger.info('Fetch %s (Count): %d' % (topic, len(df)))
    return df
Example #3
def trim_outdated(logger, run_time, pickle_name):
    pickled = load_pickle(pickle_name)
    if pickled is None:
        return
    interval_begin = run_time.shift(
        days=-ConfManage.getInt("COLLECTION_INTERVAL"))
    pickled = pickled.loc[pickled['order_time'] > interval_begin.ceil('day')]
    if save_pickle(pickled, pickle_name):
        logger.info('Successfully trimmed outdated data! [ {} - {} ]'.format(
            interval_begin.shift(days=1).floor('day').format(loggable),
            run_time.format(loggable)))
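The trimming logic above, like most examples here, does its date arithmetic with arrow. A short illustration of the shift/floor/ceil calls it relies on (the 30-day value mirrors the COLLECTION_INTERVAL setting; the dates are illustrative):

import arrow

run_time = arrow.get('2023-05-10T14:30:00')
interval_begin = run_time.shift(days=-30)         # 30 days earlier, same time of day
print(interval_begin.ceil('day'))                 # 2023-04-10T23:59:59.999999+00:00
print(interval_begin.shift(days=1).floor('day'))  # 2023-04-11T00:00:00+00:00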
Example #4
def fetch_data(start_time, end_time, table, topic, columns):
    df = load_pickle("stream_" + topic)
    if df is None:
        df = collect_batch_data(start_time=start_time,
                                end_time=end_time,
                                table=table,
                                topic=topic,
                                columns=columns)
        save_pickle(df, "stream_" + topic)
    return df
Example #5
def preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days):
    data = load_pickle(pickle)
    try:
        run_time = get_run_time(date)
        logger.info('Run-Time: %s' % run_time.format(loggable))
        run_time = run_time.shift(days=shift_days).ceil('day').ceil(
            'hour').ceil('minute').ceil('second')
        start_time = run_time.shift(
            days=-ConfManage.getInt("TRAINING_INTERVAL")).floor('day').floor(
                'hour').floor('minute').floor('second')
        logger.info('Targeted Training Interval %d [%s - %s]' % \
                    (ConfManage.getInt("TRAINING_INTERVAL"), start_time.format(loggable), run_time.format(loggable)))
        logger.info('Preprocessing with Estimator %s (%s)' % (estimator, mode))
        # Import the eta estimator class:
        module_tmp = importlib.import_module('tools.eta.{}_{}'.format(
            estimator, predict_target))
        class_tmp = getattr(
            module_tmp, '{}{}'.format(estimator.capitalize(),
                                      predict_target.capitalize()))
        estimator_obj = class_tmp()

        # Data processing
        data = estimator_obj.etl(data)
        # Remove outliers
        data = estimator_obj.filter_data(data)
        if data is not None and 'order_time' in data.columns:
            # Select only the rows inside the training interval
            data = data.loc[(data.order_time > start_time)
                            & (data.order_time < run_time)]
            order_times = data.order_time
            interval_count = len(
                order_times.apply(
                    lambda order_time: order_time.date()).unique())
            logger.info('Available Training Interval %d/%d [%s - %s]' % (interval_count, ConfManage.getInt("TRAINING_INTERVAL"), \
                         order_times.min().format(loggable), order_times.max().format(loggable)))
            # Model training:
            estimator_obj.preprocess(data, mode, holdout)
            Logger.resource_checkpoint('post-preprocess')
        else:
            raise Exception(
                "Data not yet obtained. Please run `python collect.py` first!")
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interrupted at %s' % arrow.now(
            tz=ConfManage.getString("ARROW_TIMEZONE")).format(loggable))
    logger.info('Releasing Logger...')
    # Logger.release_instance()
    return 0
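Examples #5 and #9 both resolve the estimator class at runtime from its name: for instance, ('xgb', 'eta') would map to class XgbEta in module tools.eta.xgb_eta (the concrete names are illustrative). The same pattern isolated as a helper (the function name load_estimator is hypothetical; the module path and naming convention come from the examples):

import importlib

def load_estimator(estimator, predict_target):
    """Import tools.eta.<estimator>_<predict_target> and instantiate its class."""
    module = importlib.import_module('tools.eta.{}_{}'.format(estimator, predict_target))
    cls = getattr(module, '{}{}'.format(estimator.capitalize(), predict_target.capitalize()))
    return cls()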
Example #6
def trim_data(logger, date, reverse=False):
    """
    根据日期删除数据, 当reverse为True时,删除日期之前的数据,否则删除之后的数据
    """
    pickle_name = 'data'
    pickled = load_pickle(pickle_name)
    if pickled is None:
        return
    if isinstance(date, tuple):
        # Keep only the rows outside the (date[0], date[1]) interval.
        pickled = pd.concat([pickled.loc[pickled['order_time'] < date[0]],
                             pickled.loc[pickled['order_time'] > date[1]]])
    else:
        if reverse is True:
            date = date.ceil('day')
            pickled = pickled.loc[pickled['order_time'] > date]
        else:
            date = date.floor('day')
            pickled = pickled.loc[pickled['order_time'] < date]
    if save_pickle(pickled, pickle_name):
        logger.info('Successfully trimmed data! date={}, reverse={}'.format(
            date, reverse))
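How trim_data might be invoked, assuming order_time holds arrow-comparable timestamps (the dates below are illustrative):

import arrow

# Delete everything before May 1st: reverse=True keeps only newer rows.
trim_data(logger, arrow.get('2023-05-01'), reverse=True)

# Delete everything after May 1st (the default direction).
trim_data(logger, arrow.get('2023-05-01'))

# Delete the rows between the two dates, keeping both sides.
trim_data(logger, (arrow.get('2023-04-01'), arrow.get('2023-04-07')))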
Example #7
    def load_model_cache(self, name='undefined', using_joblib=False):
        cache_key = 'pickle_cache_{}'.format(name)
        ret = self.cache.get(cache_key)
        if ret is not None:
            logger.debug('load_pickle_cache, fetch from cache, cache_key={}'.format(cache_key))
            return ret
        logger.debug('load_pickle_cache, fetch from raw pickle')
        path = "pickles/{app_mode}-{zone}-{estimator_key}".format(app_mode=ConfManage.getString("APP_MODE"),
                                                                  zone=ConfManage.getString("ZONE"),
                                                                  estimator_key=name)
        if name[:3] == "xgb":
            ret = xgb.Booster(model_file=path)
        elif name[:2] == "tf":
            ret = tf.keras.models.load_model(path, compile=False)
            ret.compile(optimizer=self.estimator().get_optimizer(), loss=self.estimator().loss_class,  # todo: self.estimator() is not initialised/configured here
                        metrics=['mae', 'mse'])
        else:
            ret = load_pickle(name, using_joblib)
        if ret is not None:
            cached = self.cache.set(cache_key, ret, ConfManage.getInt("PICKLE_CACHE_EXPIRE"))
            logger.debug('load_pickle_cache, set cache, cache_key={}, status={}'.format(cache_key, cached))
        return ret
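Example #7 assumes a self.cache object exposing get(key) and set(key, value, expire). The real project most likely wraps Redis or similar; a purely illustrative in-memory stand-in with the same interface:

import time

class SimpleCache:
    """Minimal get/set cache with per-key expiry, mirroring the interface used above."""

    def __init__(self):
        self._store = {}

    def get(self, key):
        entry = self._store.get(key)
        if entry is None:
            return None
        value, expires_at = entry
        if time.time() > expires_at:
            del self._store[key]
            return None
        return value

    def set(self, key, value, expire_seconds):
        self._store[key] = (value, time.time() + expire_seconds)
        return True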
Example #8
def main():
    """Obtain Information from Data-API and MySQL Database"""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--clear', \
                        help='Clear previously saved pickles.', action='store_true')
    parser.add_argument('-r', '--reverse', \
                        help='Whether to clear previous data.', action='store_true')
    parser.add_argument('-d', '--date', \
                        help='Date used for calculation.', type=str)
    parser.add_argument('-p', '--pickle', type=str, \
                        help='Pickle name for saving the latest data-collection.', default='data')
    parser.add_argument('-u', '--update', \
                        help='Merge data with a new feature into data.pkl.', action='store_true')
    parser.add_argument('-f', '--function', \
                        help='Update a new feature from the given function.')
    parser.add_argument('-m', '--merge_on', \
                        help='Field names to join on. Must be found in both DataFrames.')
    args = parser.parse_args()

    # Update data with new features
    if args.update:
        data = load_pickle(args.pickle)
        if args.function is not None and args.merge_on is not None:
            update_data(logger, args.function, data, args.merge_on, args.pickle)
        else:
            logger.error(
                'function and merge_on are None, please provide the corresponding parameters'
            )
        return

    # Clear all sub-pickles.
    if args.clear:
        clear_pickles(logger)
        return

    is_reverse = args.reverse  # collect data forwards or backwards
    run_time = get_run_time(None, 0, False)
    logger.info('Run-Time: %s' % run_time)
    collect_date = None if args.date is None else get_run_time(args.date)
    logger.info('Collect-Date: %s' % collect_date)
    # Last timestamp of the collected data
    end_time = run_time.shift(days=-1).ceil('day')
    logger.info('End-Time: %s' % end_time)

    pickled = load_pickle(args.pickle)
    collected_start_time = None
    collected_count = 0
    if pickled is not None and isinstance(
            pickled, pd.DataFrame) and 'time' in pickled.columns:
        times = pickled['time']
        del pickled
        collected_count = len(
            times.apply(lambda order_time: order_time.date()).unique())
        collected_start_time = times.min()
        logger.info('Min collected order_time Date: %s' %
                    collected_start_time.format(loggable))
        collected_end_time = times.max()
        logger.info('Max collected order_time Date: %s' %
                    collected_end_time.format(loggable))

        if collect_date is not None:
            if collect_date > end_time:
                logger.warning(
                    'collect_date cannot be greater than end_time {} > {}'.
                    format(collect_date.format(loggable),
                           end_time.format(loggable)))
                return
            if collect_date < collected_start_time.floor('day'):
                start_time = collect_date.floor('day')
                end_time = collected_start_time.shift(days=-1).ceil('day')
            elif collect_date > collected_end_time.ceil('day'):
                start_time = collected_end_time.shift(days=1).floor('day')
                end_time = collect_date.ceil('day')
            else:
                logger.warning('collect_date invalid. {}'.format(collect_date))
                return
        else:
            if collected_end_time >= end_time:
                logger.info('Targeted Run-Time already in Collection-Interval')
                return
            else:
                start_time = collected_end_time.shift(days=1).floor('day')

        gap = start_time.shift(days=-1).date() - end_time.date()
        gap_days = gap.days

    else:
        logger.info('Data empty!')
        gap_days = -ConfManage.getInt("COLLECTION_GAP")
        start_time = end_time.shift(days=gap_days + 1).floor('day')

    logger.info('Total Collection Interval: %d/%d [%s - %s]' %
                (collected_count, ConfManage.getInt("COLLECTION_INTERVAL"),
                 start_time.format(loggable), end_time.format(loggable)))

    if gap_days >= 0:
        logger.info('Targeted Run-Time already in Collection-Interval')
        return

    logger.info('Gap: %d' % (gap_days))
    logger.info(
        'Gap Interval: %d [%s - %s]' %
        (gap_days, start_time.format(loggable), end_time.format(loggable)))
    try:
        # Collect day by day when more than one day of data is missing
        for i in range(-gap_days, 0, -1):
            end_time = start_time.ceil('day')
            logger.info('Collecting data in [{} - {}]'.format(
                start_time.format(loggable), end_time.format(loggable)))
            collect(logger, start_time, end_time, args.pickle)
            logger.info('Success collect data in [{} - {}] \n\n'.format(
                start_time.format(loggable), end_time.format(loggable)))
            start_time = start_time.shift(days=1)
        trim_outdated(logger, run_time, args.pickle)  # without the env variable set, keeps the most recent 30 days by default
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interrupted at {}'.format(arrow.utcnow()))
    logger.info('Releasing Logger...')
    return 0
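The -u/--update flag uses action='store_true' rather than type=bool, because argparse applies the type callable to the raw string, and bool() of any non-empty string (including 'False') is True. A minimal demonstration of the pitfall:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--buggy', type=bool, default=False)  # classic argparse pitfall
parser.add_argument('--fixed', action='store_true')

args = parser.parse_args(['--buggy', 'False'])
print(args.buggy)  # True -- bool('False') is truthy
print(args.fixed)  # False -- flag absent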
Example #9
def process(logger, pickle, estimator, predict_target, withhold, date=None, shift_days=-1):
    run_time = get_run_time(date, shift_days=shift_days)
    logger.info('Run-Time: %s' % run_time.format(loggable))
    end_time = run_time.ceil('day').ceil('hour').ceil('minute').ceil('second')
    logger.info('Targeted Processing Interval [%s - %s]' % (run_time.format(loggable), end_time.format(loggable)))
    estimator_name = estimator

    # Import the corresponding estimator class
    module_tmp = importlib.import_module('tools.eta.{}_{}'.format(estimator_name, predict_target))
    class_tmp = getattr(module_tmp, '{}{}'.format(estimator_name.capitalize(), predict_target.capitalize()))
    estimator = class_tmp()
    features = sorted(estimator.features)
    # Load the data
    data = load_pickle(pickle)
    data = data.loc[data.order_time < end_time]
    # Extract a single day of data
    original_test_data = data.loc[(data['time'] > run_time) & (data['time'] < end_time)]
    # Data processing
    original_test_data = estimator.etl(original_test_data)

    predict_data = original_test_data.loc[:, features]
    # Model prediction:
    try:
        predict_value = estimator.predict(predict_data)
    except XGBoostError as err:
        logger.error('predict XGBoostError: {}'.format(err))
        raise err
    except Exception as err:
        logger.error('Other error: %s' % err)
        raise err

    original_test_data["original_predict_value"] = predict_value
    del predict_value
    original_test_data_with_pre = original_test_data
    del original_test_data
    # Remove invalid rows (data now has the anomalies filtered out):
    data = estimator.filter_data(data, run_time, end_time)
    if data is not None and 'order_time' in data.columns:
        filter_data = data.loc[(data['order_time'] > run_time) & (data['order_time'] < end_time)].copy()
        del data
        filter_data['isvalid'] = 1
        logger.info('Test data already in collection. Test-data count after filtering: %d' % len(filter_data))

        # All rows for the day, with both the time-amended and the original predicted values, flagged for validity
        original_amend_test_data_with_pre = estimator.amend_time(original_test_data_with_pre)  # todo: time amendment
        original_amend_test_data_with_pre = pd.merge(original_amend_test_data_with_pre,
                                                     filter_data.loc[:, ["id", "isvalid", "time"]], how="left",
                                                     on=["id", "time"])
        # Valid time-amended rows, still carrying the un-amended values:
        valid_amend_test_data_with_pre = original_amend_test_data_with_pre.loc[original_amend_test_data_with_pre.isvalid == 1]

        # Test-set metrics before time amendment: MAE, MSE, R2, N-minute accuracy, etc.
        original_valid_test_results = estimator.test_results(valid_amend_test_data_with_pre,
                                                             pd.Series(valid_amend_test_data_with_pre.loc[:, predict_target]),
                                                             pd.Series(valid_amend_test_data_with_pre.original_predict_value)
                                                             )
        logger.info('test_results={}'.format(original_valid_test_results))
        save_pickle(original_valid_test_results, '%s_%s_original_test_results' % (estimator_name, predict_target))
        # Test-set metrics after time amendment: MAE, MSE, R2, N-minute accuracy, etc.
        valid_test_results = estimator.test_results(valid_amend_test_data_with_pre,
                                                    pd.Series(valid_amend_test_data_with_pre.loc[:, predict_target]),
                                                    pd.Series(valid_amend_test_data_with_pre.predict_value)
                                                    )
        logger.info('test_results={}'.format(valid_test_results))
        save_pickle(valid_test_results, '%s_%s_test_results' % (estimator_name, predict_target))
        if withhold:
            logger.info('withhold={}'.format(withhold))
            estimator.save_DB(original_amend_test_data_with_pre, valid_test_results)
        Logger.resource_checkpoint('after-process')