Example #1
0
def preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days):
    """Run an estimator's preprocessing step over the training window.

    The training window ends at the end of the day ``shift_days`` days away
    from the run time derived from ``date`` and starts
    ``TRAINING_INTERVAL`` (configuration value) days before that, at the
    start of the day. Rows whose ``order_time`` lies strictly inside that
    window are handed to the estimator's own ``preprocess`` method.

    Args:
        date: Value forwarded to ``get_run_time`` to derive the run time.
        pickle: Pickle name forwarded to ``load_pickle``.
        estimator: Estimator name. Together with ``predict_target`` it
            selects the module ``tools.eta.<estimator>_<predict_target>``
            and the class ``<Estimator><Predict_target>`` (each part passed
            through ``str.capitalize``).
        predict_target: Prediction target name (see ``estimator``).
        holdout: Forwarded unchanged to the estimator's ``preprocess``.
        mode: Forwarded unchanged to the estimator's ``preprocess``; also
            logged.
        shift_days: Day offset applied to the run time before the window
            is computed.

    Returns:
        Always ``0`` when the function returns. ``AttributeError`` and
        ``ValueError`` raised inside the pipeline are logged, not
        propagated.

    Raises:
        Exception: If the data is ``None`` after filtering or lacks a
            ``time`` column.
    """
    data = load_pickle(pickle)
    try:
        run_time = get_run_time(date)
        logger.info('Run-Time: %s' % run_time.format(loggable))
        run_time = run_time.shift(days=shift_days).ceil('day').ceil(
            'hour').ceil('minute').ceil('second')
        # Read the configured window length once instead of on every use.
        training_interval = ConfManage.getInt("TRAINING_INTERVAL")
        start_time = run_time.shift(
            days=-training_interval).floor('day').floor(
                'hour').floor('minute').floor('second')
        logger.info('Targeted Training Interval %d [%s - %s]' % \
                    (training_interval, start_time.format(loggable), run_time.format(loggable)))
        logger.info('Preprocessing with Estimator %s (%s)' % (estimator, mode))
        # Import the ETA estimator class for this estimator/target pair.
        module_tmp = importlib.import_module('tools.eta.{}_{}'.format(
            estimator, predict_target))
        class_tmp = getattr(
            module_tmp, '{}{}'.format(estimator.capitalize(),
                                      predict_target.capitalize()))
        estimator_obj = class_tmp()

        # Estimator-specific data processing (ETL).
        data = estimator_obj.etl(data)
        # Remove outliers.
        data = estimator_obj.filter_data(data)
        # NOTE(review): the guard checks for a 'time' column while the code
        # below reads 'order_time' -- confirm which column is intended.
        if data is not None and 'time' in data.columns:
            # Keep only the rows strictly inside the training window.
            data = data.loc[(data.order_time > start_time)
                            & (data.order_time < run_time)]
            if data.empty:
                # Without this check min()/max() of the empty column yield
                # NaN and the log call below fails with an unrelated
                # AttributeError. ValueError is handled by the same
                # ``except`` clause, so the flow (log, return 0) is the
                # same but the logged reason is now accurate.
                raise ValueError(
                    'No data within the training interval [%s - %s]' %
                    (start_time.format(loggable), run_time.format(loggable)))
            order_times = data.order_time
            # Number of distinct calendar days present in the window.
            interval_count = len(
                order_times.apply(
                    lambda order_time: order_time.date()).unique())
            logger.info('Available Training Interval %d/%d [%s - %s]' % (interval_count, training_interval, \
                         order_times.min().format(loggable), order_times.max().format(loggable)))
            # Hand the windowed data to the estimator.
            estimator_obj.preprocess(data, mode, holdout)
            Logger.resource_checkpoint('post-preprocess')
        else:
            raise Exception(
                "Data not yet obtained. Please run `python collect.py` first!")
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interupted at %s' % arrow.now(
            tz=ConfManage.getString("ARROW_TIMEZONE")).format(loggable))
    logger.info('Releasing Logger...')
    # Logger.release_instance()
    return 0
Example #2
0
def _parse_bool_flag(value):
    """Convert a command-line string into a boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    every non-empty value -- including ``"False"`` and ``"0"`` -- becomes
    ``True``. Here the usual "false" spellings (and the empty string) map to
    ``False``; every other value is ``True``.
    """
    falsy = ('', '0', 'false', 'f', 'no', 'n', 'off')
    return str(value).strip().lower() not in falsy


def main():
    """Obtain Information from Data-API and MySQL Database.

    Command-line entry point. Depending on the parsed arguments it either

    * merges a new feature into the pickled data (``--updata``),
    * clears the saved pickles (``--clear``), or
    * works out which days are missing from the pickled collection and
      collects them one day at a time, then trims outdated data.

    Returns:
        ``0`` when the collection stage is reached and finishes (including
        when an ``AttributeError``/``ValueError``/``KeyboardInterrupt`` was
        logged); ``None`` on every early exit.
    """
    logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))
    Logger.resource_checkpoint('init')
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--clear',
                        help='Clear previously saved pickles.',
                        action='store_true')
    parser.add_argument('-r',
                        '--reverse',
                        help='whether clear previously data.',
                        action='store_true')
    parser.add_argument('-d',
                        '--date',
                        help='Date used for calculation.',
                        type=str)
    parser.add_argument('-p',
                        '--pickle',
                        type=str,
                        default='stream_data',
                        help='Pickle name for saving latest data-collection.')
    # type=bool would turn "False"/"0" into True; parse the text instead.
    parser.add_argument('-u',
                        '--updata',
                        type=_parse_bool_flag,
                        help='Merge data with new feature to data.pkl.',
                        default=False)
    parser.add_argument('-f',
                        '--funtion',
                        help='Update new feature from funtion.')
    parser.add_argument(
        '-m',
        '--merge_on',
        help='Field names to join on. Must be found in both DataFrames.')
    args = parser.parse_args()

    # Merge a new feature into the existing data.
    if args.updata:
        data = load_pickle(args.pickle)
        if args.funtion is not None and args.merge_on is not None:
            update_data(logger, args.funtion, data, args.merge_on)
        else:
            logger.error(
                'Funtion and Merge_on is None, Please provide corresponding parameters'
            )
        return
    # Clear all sub-pickles.
    if args.clear:
        clear_pickles(logger)
        return

    # Collect forwards or backwards. A store_true option is False (never
    # None) when absent, so the flag value is used directly; the previous
    # ``is None`` test made this True on every run.
    is_reverse = args.reverse
    # Work out the time range to handle.
    run_time = get_run_time(None, 0, False)
    logger.info('Run-Time: %s' % run_time)
    collect_date = None if args.date is None else get_run_time(args.date)
    logger.info('Collect-Date: %s' % collect_date)
    end_time = run_time.shift(days=-1).ceil('day')
    logger.info('End-Time: %s' % end_time)
    # Load the data collected so far.
    pickled = load_pickle(args.pickle)
    collected_count = 0
    if pickled is not None and isinstance(
            pickled, pd.DataFrame) and 'order_time' in pickled.columns:
        order_times = pickled['order_time']
        del pickled  # Release pickle
        # Number of distinct calendar days already collected.
        collected_count = len(
            order_times.apply(lambda order_time: order_time.date()).unique())
        collected_start_time = order_times.min()
        logger.info('Min collected order_time Date: %s' %
                    collected_start_time.format(loggable))
        collected_end_time = order_times.max()
        logger.info('Max collected order_time Date: %s' %
                    collected_end_time.format(loggable))

        if collect_date is not None:
            if collect_date > end_time:
                logger.warning(
                    'collect_date can not greater then end_time {} > {}'.
                    format(collect_date.format(loggable),
                           end_time.format(loggable)))
                return
            if collect_date < collected_start_time.floor('day'):
                # Requested date precedes the collection: fill the days
                # from it up to the day before the first collected day.
                start_time = collect_date.floor('day')
                end_time = collected_start_time.shift(days=-1).ceil('day')
            elif collected_start_time.floor(
                    'day') <= collect_date <= collected_end_time.ceil('day'):
                # Requested date lies inside the collection: trim, then
                # re-collect the side selected by --reverse.
                trim_data(logger, collect_date, is_reverse)
                if is_reverse:
                    start_time = collected_start_time.floor('day')
                    end_time = collect_date.ceil('day')
                else:
                    start_time = collect_date.floor('day')
                    end_time = collected_end_time.ceil('day')
            elif collect_date > collected_end_time.ceil('day'):
                # Requested date follows the collection: continue from the
                # day after the last collected day.
                start_time = collected_end_time.shift(days=1).floor('day')
                end_time = collect_date.ceil('day')
            else:
                logger.warning('collect_data invalid. {}'.format(collect_date))
                return
        else:
            if collected_end_time >= end_time:
                logger.info('Targeted Run-Time already in Collection-Interval')
                return
            else:
                start_time = collected_end_time.shift(days=1).floor('day')

        # Negative when days are missing; -gap_days is the number of days
        # to collect.
        gap = start_time.shift(days=-1).date() - end_time.date()
        gap_days = gap.days

    else:
        logger.info('Data empty!')
        gap_days = -ConfManage.getInt("COLLECTION_GAP")
        start_time = end_time.shift(days=gap_days + 1).floor('day')

    logger.info('Total Collection Interval: %d/%d [%s - %s]' %
                (collected_count, ConfManage.getInt("COLLECTION_INTERVAL"),
                 start_time.format(loggable), end_time.format(loggable)))

    if gap_days >= 0:
        logger.info('Targeted Run-Time already in Collection-Interval')
        return

    logger.info('Gap: %d' % gap_days)
    logger.info(
        'Gap Interval: %d [%s - %s]' %
        (gap_days, start_time.format(loggable), end_time.format(loggable)))
    try:
        # When one or more days are missing, collect them one day at a time.
        for _ in range(-gap_days, 0, -1):
            end_time = start_time.ceil('day')
            logger.info('Collecting data in [{} - {}]'.format(
                start_time.format(loggable), end_time.format(loggable)))
            collect(logger, start_time, end_time, args.pickle)
            logger.info('Success collect data in [{} - {}] \n\n'.format(
                start_time.format(loggable), end_time.format(loggable)))
            start_time = start_time.shift(days=1)
        # Original note: without the environment variable set, the most
        # recent 30 days are kept by default.
        trim_outdated(logger, run_time, args.pickle)
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interupted at {}'.format(arrow.utcnow()))
    logger.info('Releasing Logger...')
    return 0
Example #3
0
    parser.add_argument('predict_target',
                        help='目标值',
                        nargs='?',
                        type=str,
                        default='accept')
    parser.add_argument('--holdout', help='是否拆分3/7数据训练模型',
                        action='store_true')  # True or Flase
    parser.add_argument("-s",
                        "--shift_days",
                        help="The last few days",
                        type=int,
                        default=-1)  # True or Flase
    args = parser.parse_args()
    date = args.date
    pickle = args.pickle
    optimal = args.optimal
    estimator = args.estimator
    predict_target = args.predict_target
    holdout = args.holdout
    shift_days = args.shift_days

    mode = predict_target if optimal else 'optimal'
    logger.info('Arguments: estimator=%s, predict-target=%s, mode=%s' %
                (estimator, predict_target, mode))
    logger.info('Environment-Configs: training-interval=%d' %
                (ConfManage.getInt("TRAINING_INTERVAL")))
    Logger.resource_checkpoint('post-argparse')

    preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days)
Example #4
0
def process(logger, pickle, estimator, predict_target, withhold, date=None, shift_days=-1):
    """Predict one processing window of data and score the predictions.

    The window runs from the run time returned by ``get_run_time`` to the
    end of that same day. Rows in the window are predicted with the selected
    estimator, the predictions are amended via the estimator's
    ``amend_time``, and test metrics are computed for the rows that survive
    the estimator's ``filter_data`` both before and after the amendment.
    Both metric sets are saved as pickles.

    Args:
        logger: Logger used for info/error messages.
        pickle: Pickle name forwarded to ``load_pickle``.
        estimator: Estimator name. Together with ``predict_target`` it
            selects the module ``tools.eta.<estimator>_<predict_target>``
            and the class ``<Estimator><Predict_target>``. The local name is
            later rebound to the instantiated estimator object.
        predict_target: Prediction target name; also used as the column
            holding the true values when computing test results.
        withhold: When truthy, the amended data and the post-amendment
            results are passed to the estimator's ``save_DB``.
        date: Forwarded to ``get_run_time``.
        shift_days: Forwarded to ``get_run_time``.

    Raises:
        XGBoostError, Exception: Anything raised by the estimator's
            ``predict`` is logged and re-raised.
    """
    run_time = get_run_time(date, shift_days=shift_days)
    logger.info('Run-Time: %s' % run_time.format(loggable))
    end_time = run_time.ceil('day').ceil('hour').ceil('minute').ceil('second')
    logger.info('Targeted Processing Interval [%s - %s]' % (run_time.format(loggable), end_time.format(loggable)))
    # Keep the estimator's name: `estimator` is rebound to the instance below.
    estimator_name = estimator
    # NOTE(review): self-assignment, has no effect.
    predict_target = predict_target

    # Import the class matching the estimator/target pair.
    module_tmp = importlib.import_module('tools.eta.{}_{}'.format(estimator_name, predict_target))
    class_tmp = getattr(module_tmp, '{}{}'.format(estimator_name.capitalize(), predict_target.capitalize()))
    estimator = class_tmp()
    features = sorted(estimator.features)
    # Load the data and drop everything at or after the end of the window.
    data = load_pickle(pickle)
    data = data.loc[data.order_time < end_time]
    # Extract the rows inside the window.
    # NOTE(review): this selects on 'time' while the line above and the
    # validity filter below use 'order_time' -- confirm this is intended.
    original_test_data = data.loc[(data['time'] > run_time) & (data['time'] < end_time)]
    # Estimator-specific data processing (ETL).
    original_test_data = estimator.etl(original_test_data)

    predict_data = original_test_data.loc[:, features]
    # Model prediction; failures are logged and re-raised.
    try:
        predict_value = estimator.predict(predict_data)
    except XGBoostError as err:
        logger.error('predict XGBoostError: {}'.format(err))
        raise err
    except Exception as err:
        logger.error('Other error: %s' % err)
        raise err

    # Keep the raw model output next to the inputs so it can be compared
    # with the amended prediction later.
    original_test_data["original_predict_value"] = predict_value
    del predict_value
    original_test_data_with_pre = original_test_data
    del original_test_data
    # Remove unreasonable rows (`data` now has the outliers removed).
    data = estimator.filter_data(data, run_time, end_time)
    if data is not None and 'order_time' in data.columns:
        filter_data = data.loc[(data['order_time'] > run_time) & (data['order_time'] < end_time)]
        del data
        # Mark every row that survived filtering as valid.
        # NOTE(review): assigning a column on a `.loc` selection may trigger
        # pandas' SettingWithCopyWarning -- confirm.
        filter_data['isvalid'] = 1
        logger.info('Test-Data already in Collection. Test-Data after filter Count: %d' % len(filter_data))

        # All rows of the window, carrying both the amended and the original
        # prediction, with a marker for whether each row is valid.
        original_amend_test_data_with_pre = estimator.amend_time(original_test_data_with_pre) # TODO: time amendment
        # Left merge: rows absent from filter_data get no `isvalid` value.
        original_amend_test_data_with_pre = pd.merge(original_amend_test_data_with_pre,
                                                     filter_data.loc[:, ["id", "isvalid", "time"]], how="left",
                                                     on=["id", "time"])
        # Valid rows after amendment; they still carry the un-amended value.
        valid_amend_test_data_with_pre = original_amend_test_data_with_pre.loc[original_amend_test_data_with_pre.isvalid == 1]

        # Test-set metrics before amendment: MAE, MSE, R2, N-minute accuracy, etc.
        original_valid_test_results = estimator.test_results(valid_amend_test_data_with_pre,
                                                             pd.Series(valid_amend_test_data_with_pre.loc[:, predict_target]),
                                                             pd.Series(valid_amend_test_data_with_pre.original_predict_value)
                                                             )
        logger.info('test_results={}'.format(original_valid_test_results))
        save_pickle(original_valid_test_results, '%s_%s_original_test_results' % (estimator_name, predict_target))
        # Test-set metrics after amendment: MAE, MSE, R2, N-minute accuracy, etc.
        # NOTE(review): the `predict_value` column is presumably added by
        # `amend_time` -- confirm.
        valid_test_results = estimator.test_results(valid_amend_test_data_with_pre,
                                              pd.Series(valid_amend_test_data_with_pre.loc[:, predict_target]),
                                              pd.Series(valid_amend_test_data_with_pre.predict_value)
                                              )
        logger.info('test_results={}'.format(valid_test_results))
        save_pickle(valid_test_results, '%s_%s_test_results' % (estimator_name, predict_target))
        if withhold:
            logger.info('withhold={}'.format(withhold))
            estimator.save_DB(original_amend_test_data_with_pre, valid_test_results)
        Logger.resource_checkpoint('after-process')