def save_data(timelines, filename, logger, features=None):
    """
    Append the latest collected data to the pickled collection.
    :param timelines: DataFrame with the newly collected rows
    :param filename: pickle name of the existing collection
    :param logger: logger instance
    :param features: optional list of columns to keep before saving
    :return: result of save_pickle, or False when the new data overlaps the existing collection
    """
    data = load_pickle(filename)
    if features is not None:
        save_features = timelines.loc[:, features]
    else:
        save_features = timelines
    if data is not None:
        if data.time.astype(int).max() < save_features.time.astype(int).min():
            data = data.append(save_features, sort=False)
            logger.info('Successfully appended new data to %s!' % filename)
        else:
            logger.info('%s: Targeted Run-Time already in Collection-Interval!' % filename)
            del data
            return False
    else:
        data = save_features
    isSave = save_pickle(data, filename)
    del data
    return isSave
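
# Hedged usage sketch for save_data(): append a freshly collected DataFrame to the
# 'data' pickle (the default name used by main() below). The feature list here is
# illustrative, not the project's real feature set.
def _example_daily_save(new_rows, logger):
    appended = save_data(new_rows, 'data', logger, features=['id', 'time', 'order_time'])
    if not appended:
        logger.warning('save_data skipped: run-time already inside the collection interval, or pickling failed')
    return appended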
def fetch_data(start_time, end_time, table, topic, columns=None,
               record_path=None, meta=None, save_file_prefix=""):
    """Return the pickled data for `topic` if present, otherwise collect and pickle it."""
    df = load_pickle(save_file_prefix + topic)
    if df is None:
        df = collect_batch_data(start_time=start_time, end_time=end_time, table=table,
                                topic=topic, columns=columns, record_path=record_path, meta=meta)
        save_pickle(df, save_file_prefix + topic)
    elif len(df) == 0:
        df = collect_batch_data(start_time=start_time, end_time=end_time, table=table,
                                topic=topic, columns=columns, record_path=record_path, meta=meta)
        save_pickle(df, save_file_prefix + topic)
    logger.info('Fetch %s (Count): %d' % (topic, len(df)))
    return df
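
# Hedged usage sketch for the cached batch fetch above. The table and topic names
# and the 'batch_' prefix are hypothetical; only the caching behavior is taken
# from fetch_data() itself.
def _example_fetch_orders(start_time, end_time):
    # The first call collects from the Data-API and pickles the result; later
    # calls with the same prefix/topic reuse the pickle unless it is empty.
    return fetch_data(start_time, end_time,
                      table='order',           # hypothetical table name
                      topic='order_events',    # hypothetical topic name
                      save_file_prefix='batch_')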
def trim_outdated(logger, run_time, pickle_name):
    pickled = load_pickle(pickle_name)
    if pickled is None:
        return
    interval_begin = run_time.shift(days=-ConfManage.getInt("COLLECTION_INTERVAL"))
    pickled = pickled.loc[pickled['order_time'] > interval_begin.ceil('day')]
    if save_pickle(pickled, pickle_name):
        logger.info('Successfully Trimmed outdated! [ {} - {} ]'.format(
            interval_begin.shift(days=1).floor('day').format(loggable),
            run_time.format(loggable)))
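
# Hedged usage sketch for trim_outdated(): keep only the configured
# COLLECTION_INTERVAL ending at the current run time. The pickle name 'data'
# matches the default used by main() below.
def _example_trim_outdated(logger):
    run_time = get_run_time(None, 0, False)
    trim_outdated(logger, run_time, 'data')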
def fetch_data(start_time, end_time, table, topic, columns):
    df = load_pickle("stream_" + topic)
    if df is None:
        df = collect_batch_data(start_time=start_time, end_time=end_time,
                                table=table, topic=topic, columns=columns)
        save_pickle(df, "stream_" + topic)
    return df
def preprocess(date, pickle, estimator, predict_target, holdout, mode, shift_days):
    data = load_pickle(pickle)
    try:
        run_time = get_run_time(date)
        logger.info('Run-Time: %s' % run_time.format(loggable))
        run_time = run_time.shift(days=shift_days).ceil('day').ceil('hour').ceil('minute').ceil('second')
        start_time = run_time.shift(days=-ConfManage.getInt("TRAINING_INTERVAL")).floor('day').floor('hour').floor('minute').floor('second')
        logger.info('Targeted Training Interval %d [%s - %s]' %
                    (ConfManage.getInt("TRAINING_INTERVAL"),
                     start_time.format(loggable), run_time.format(loggable)))
        logger.info('Preprocessing with Estimator %s (%s)' % (estimator, mode))
        # Import the ETA estimator class
        module_tmp = importlib.import_module('tools.eta.{}_{}'.format(estimator, predict_target))
        class_tmp = getattr(module_tmp, '{}{}'.format(estimator.capitalize(), predict_target.capitalize()))
        estimator_obj = class_tmp()
        # Data transformation (ETL)
        data = estimator_obj.etl(data)
        # Remove outliers
        data = estimator_obj.filter_data(data)
        if data is not None and 'time' in data.columns:
            # Select data within the training interval
            data = data.loc[(data.order_time > start_time) & (data.order_time < run_time)]
            order_times = data.order_time
            interval_count = len(order_times.apply(lambda order_time: order_time.date()).unique())
            logger.info('Available Training Interval %d/%d [%s - %s]' %
                        (interval_count, ConfManage.getInt("TRAINING_INTERVAL"),
                         order_times.min().format(loggable), order_times.max().format(loggable)))
            # Model training
            estimator_obj.preprocess(data, mode, holdout)
            Logger.resource_checkpoint('post-preprocess')
        else:
            raise Exception("Data not yet obtained. Please run `python collect.py` first!")
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interrupted at %s' %
                    arrow.now(tz=ConfManage.getString("ARROW_TIMEZONE")).format(loggable))
    logger.info('Releasing Logger...')
    # Logger.release_instance()
    return 0
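
# Hedged sketch of the naming convention preprocess() relies on: an estimator name
# and predict target such as 'xgb' and 'eta' (illustrative values) resolve to the
# module tools.eta.xgb_eta and the class XgbEta. Only the derivation itself is
# taken from the code above.
def _example_resolve_estimator(estimator='xgb', predict_target='eta'):
    module_tmp = importlib.import_module('tools.eta.{}_{}'.format(estimator, predict_target))
    class_tmp = getattr(module_tmp, '{}{}'.format(estimator.capitalize(), predict_target.capitalize()))
    return class_tmp()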
def trim_data(logger, date, reverse=False):
    """
    Delete data relative to `date`. When `reverse` is True, rows before the date
    are deleted; otherwise rows after the date are deleted. A (start, end) tuple
    is interpreted as removing the rows between the two dates.
    """
    pickle_name = 'data'
    pickled = load_pickle(pickle_name)
    if pickled is None:
        return
    if isinstance(date, tuple):
        # Assumption: keep only the rows outside the (date[0], date[1]) interval.
        pickled1 = pickled.loc[pickled['order_time'] < date[0]]
        pickled2 = pickled.loc[pickled['order_time'] > date[1]]
        pickled = pickled1.append(pickled2, sort=False)
    else:
        if reverse is True:
            date = date.ceil('day')
            pickled = pickled.loc[pickled['order_time'] > date]
        else:
            date = date.floor('day')
            pickled = pickled.loc[pickled['order_time'] < date]
    if save_pickle(pickled, pickle_name):
        date_repr = date if isinstance(date, tuple) else date.format(loggable)
        logger.info('Successfully Trimmed {} data!, reverse={}'.format(date_repr, reverse))
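
# Hedged usage sketches for trim_data(); the dates are illustrative, and the
# tuple form follows the interpretation documented in the docstring above.
def _example_trim_data(logger):
    cutoff = arrow.now(tz=ConfManage.getString("ARROW_TIMEZONE"))
    trim_data(logger, cutoff)                                          # drop rows after the cutoff day
    trim_data(logger, cutoff, reverse=True)                            # drop rows before the cutoff day
    trim_data(logger, (cutoff.shift(days=-7), cutoff.shift(days=-3)))  # drop rows between two dates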
def load_model_cache(self, name='undefined', using_joblib=False):
    cache_key = 'pickle_cache_{}'.format(name)
    ret = self.cache.get(cache_key)
    if ret is None:
        logger.debug('load_pickle_cache, fetch from raw pickle')
        path = "pickles/{app_mode}-{zone}-{estmator_key}".format(app_mode=ConfManage.getString("APP_MODE"),
                                                                 zone=ConfManage.getString("ZONE"),
                                                                 estmator_key=name)
        if name[:3] == "xgb":
            ret = xgb.Booster(model_file=path)
        elif name[:2] == "tf":
            ret = tf.keras.models.load_model(path, compile=False)
            ret.compile(optimizer=self.estimator().get_optimizer(),
                        loss=self.estimator().loss_class,  # TODO: self.estimator() is not initialized/configured here
                        metrics=['mae', 'mse'])
        else:
            ret = load_pickle(name, using_joblib)
        if ret is not None:
            cached = self.cache.set(cache_key, ret, ConfManage.getInt("PICKLE_CACHE_EXPIRE"))
            logger.debug('load_pickle_cache, set cache, cache_key={}, status={}'.format(cache_key, cached))
    else:
        logger.debug('load_pickle_cache, fetch from cache, cache_key={}'.format(cache_key))
    return ret
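
# Hedged usage sketch for load_model_cache(): the model name prefix selects the
# loader ('xgb...' -> XGBoost Booster, 'tf...' -> Keras model, otherwise a plain
# pickle), and the result is cached under 'pickle_cache_<name>' for
# PICKLE_CACHE_EXPIRE seconds. The names below are illustrative.
#
#     booster = self.load_model_cache(name='xgb_eta')
#     keras_model = self.load_model_cache(name='tf_eta')
#     plain_model = self.load_model_cache(name='linear_eta', using_joblib=True)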
def main():
    """Obtain Information from Data-API and MySQL Database"""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--clear',
                        help='Clear previously saved pickles.', action='store_true')
    parser.add_argument('-r', '--reverse',
                        help='Whether to clear previously collected data.', action='store_true')
    parser.add_argument('-d', '--date',
                        help='Date used for calculation.', type=str)
    parser.add_argument('-p', '--pickle', type=str,
                        help='Pickle name for saving the latest data-collection.', default='data')
    parser.add_argument('-u', '--updata', type=bool,
                        help='Merge data with a new feature into data.pkl.', default=False)
    parser.add_argument('-f', '--funtion',
                        help='Update a new feature from the given function.')
    parser.add_argument('-m', '--merge_on',
                        help='Field names to join on. Must be found in both DataFrames.')
    args = parser.parse_args()

    # Update data with a new feature
    if args.updata:
        data = load_pickle(args.pickle)
        if args.funtion is not None and args.merge_on is not None:
            update_data(logger, args.funtion, data, args.merge_on, args.pickle)
        else:
            logger.error('funtion and merge_on are None. Please provide the corresponding parameters')
        return

    # Clear all child pickles.
    if args.clear:
        clear_pickles(logger)
        return

    # Collect data forwards or backwards
    is_reverse = False if args.reverse is None else True
    run_time = get_run_time(None, 0, False)
    logger.info('Run-Time: %s' % run_time)
    collect_date = None if args.date is None else get_run_time(args.date)
    logger.info('Collect-Date: %s' % collect_date)
    # Last timestamp of the data to collect
    end_time = run_time.shift(days=-1).ceil('day')
    logger.info('End-Time: %s' % end_time)

    pickled = load_pickle(args.pickle)
    collected_start_time = None
    collected_count = 0
    if pickled is not None and isinstance(pickled, pd.DataFrame) and 'time' in pickled.columns:
        times = pickled['time']
        del pickled
        collected_count = len(times.apply(lambda order_time: order_time.date()).unique())
        collected_start_time = times.min()
        logger.info('Min collected order_time Date: %s' % collected_start_time.format(loggable))
        collected_end_time = times.max()
        logger.info('Max collected order_time Date: %s' % collected_end_time.format(loggable))
        if collect_date is not None:
            if collect_date > end_time:
                logger.warning('collect_date cannot be greater than end_time {} > {}'.format(
                    collect_date.format(loggable), end_time.format(loggable)))
                return
            if collect_date < collected_start_time.floor('day'):
                start_time = collect_date.floor('day')
                end_time = collected_start_time.shift(days=-1).ceil('day')
            elif collect_date > collected_end_time.ceil('day'):
                start_time = collected_end_time.shift(days=1).floor('day')
                end_time = collect_date.ceil('day')
            else:
                # The requested date is already inside the collected interval.
                logger.warning('collect_date invalid. {}'.format(collect_date))
                return
        else:
            if collected_end_time >= end_time:
                logger.info('Targeted Run-Time already in Collection-Interval')
                return
            else:
                start_time = collected_end_time.shift(days=1).floor('day')
        gap = start_time.shift(days=-1).date() - end_time.date()
        gap_days = gap.days
    else:
        logger.info('Data empty!')
        gap_days = -ConfManage.getInt("COLLECTION_GAP")
        start_time = end_time.shift(days=gap_days + 1).floor('day')

    logger.info('Total Collection Interval: %d/%d [%s - %s]' %
                (collected_count, ConfManage.getInt("COLLECTION_INTERVAL"),
                 start_time.format(loggable), end_time.format(loggable)))
    if gap_days >= 0:
        logger.info('Targeted Run-Time already in Collection-Interval')
        return
    logger.info('Gap: %d' % (gap_days))
    logger.info('Gap Interval: %d [%s - %s]' %
                (gap_days, start_time.format(loggable), end_time.format(loggable)))
    try:
        # Collect day by day when more than one day of data is missing
        for i in range(-gap_days, 0, -1):
            end_time = start_time.ceil('day')
            logger.info('Collecting data in [{} - {}]'.format(
                start_time.format(loggable), end_time.format(loggable)))
            collect(logger, start_time, end_time, args.pickle)
            logger.info('Successfully collected data in [{} - {}] \n\n'.format(
                start_time.format(loggable), end_time.format(loggable)))
            start_time = start_time.shift(days=1)
        # Without the environment variable set, keep only the most recent 30 days by default
        trim_outdated(logger, run_time, args.pickle)
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interrupted at {}'.format(arrow.utcnow()))
    logger.info('Releasing Logger...')
    return 0
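
# Hedged CLI sketches for this collector (the script name comes from the hint in
# preprocess(); dates and placeholders are illustrative):
#
#     python collect.py                     # fill the gap up to yesterday
#     python collect.py -d 2021-01-01       # collect a date outside the current interval
#     python collect.py -c                  # clear previously saved pickles
#     python collect.py -u True -f <funtion> -m <merge_on>   # merge a new feature into data.pkl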
def process(logger, pickle, estimator, predict_target, withhold, date=None, shift_days=-1):
    run_time = get_run_time(date, shift_days=shift_days)
    logger.info('Run-Time: %s' % run_time.format(loggable))
    end_time = run_time.ceil('day').ceil('hour').ceil('minute').ceil('second')
    logger.info('Targeted Processing Interval [%s - %s]' % (run_time.format(loggable), end_time.format(loggable)))
    estimator_name = estimator
    # Import the corresponding estimator class
    module_tmp = importlib.import_module('tools.eta.{}_{}'.format(estimator_name, predict_target))
    class_tmp = getattr(module_tmp, '{}{}'.format(estimator_name.capitalize(), predict_target.capitalize()))
    estimator = class_tmp()
    features = sorted(estimator.features)
    # Load the data
    data = load_pickle(pickle)
    data = data.loc[data.order_time < end_time]
    # Extract one day of data
    original_test_data = data.loc[(data['time'] > run_time) & (data['time'] < end_time)]
    # Data transformation (ETL)
    original_test_data = estimator.etl(original_test_data)
    predict_data = original_test_data.loc[:, features]
    # Model prediction
    try:
        predict_value = estimator.predict(predict_data)
    except XGBoostError as err:
        logger.error('predict XGBoostError: {}'.format(err))
        raise err
    except Exception as err:
        logger.error('Other error: %s' % err)
        raise err
    original_test_data["original_predict_value"] = predict_value
    del predict_value
    original_test_data_with_pre = original_test_data
    del original_test_data
    # Remove unreasonable rows (data now excludes outliers)
    data = estimator.filter_data(data, run_time, end_time)
    if data is not None and 'order_time' in data.columns:
        filter_data = data.loc[(data['order_time'] > run_time) & (data['order_time'] < end_time)]
        del data
        filter_data['isvalid'] = 1
        logger.info('Test-Data already in Collection. Test-Data after filter Count: %d' % len(filter_data))
        # All rows for the day, with both the time-amended and the original predicted values, flagged valid or not
        original_amend_test_data_with_pre = estimator.amend_time(original_test_data_with_pre)  # TODO: time amendment
        original_amend_test_data_with_pre = pd.merge(original_amend_test_data_with_pre,
                                                     filter_data.loc[:, ["id", "isvalid", "time"]],
                                                     how="left", on=["id", "time"])
        # Valid rows after time amendment, still carrying the unamended values
        valid_amend_test_data_with_pre = original_amend_test_data_with_pre.loc[
            original_amend_test_data_with_pre.isvalid == 1]
        # Test-set metrics before time amendment: MAE, MSE, R2, N-minute accuracy, etc.
        original_valid_test_results = estimator.test_results(
            valid_amend_test_data_with_pre,
            pd.Series(valid_amend_test_data_with_pre.loc[:, predict_target]),
            pd.Series(valid_amend_test_data_with_pre.original_predict_value))
        logger.info('test_results={}'.format(original_valid_test_results))
        save_pickle(original_valid_test_results, '%s_%s_original_test_results' % (estimator_name, predict_target))
        # Test-set metrics after time amendment: MAE, MSE, R2, N-minute accuracy, etc.
        valid_test_results = estimator.test_results(
            valid_amend_test_data_with_pre,
            pd.Series(valid_amend_test_data_with_pre.loc[:, predict_target]),
            pd.Series(valid_amend_test_data_with_pre.predict_value))
        logger.info('test_results={}'.format(valid_test_results))
        save_pickle(valid_test_results, '%s_%s_test_results' % (estimator_name, predict_target))
        if withhold:
            logger.info('withhold={}'.format(withhold))
            estimator.save_DB(original_amend_test_data_with_pre, valid_test_results)
    Logger.resource_checkpoint('after-process')
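
# Hedged usage sketch for process(): evaluate yesterday's orders against the
# pickled 'data' collection. The estimator and predict-target names are
# illustrative and follow the tools.eta.<estimator>_<target> convention used by
# preprocess() above.
def _example_daily_evaluation(logger):
    process(logger, pickle='data', estimator='xgb', predict_target='eta',
            withhold=False, date=None, shift_days=-1)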