def process(rdd, p_schema, step_conf, cache_conf):
    """Enrich rows with user info, partition by partition.

    Input: RDD[Row]; output: (RDD[Row], updated schema).
    Rows the enhancer maps to None are dropped.
    """
    enriched = rdd.mapPartitions(
        lambda part: EnhanceUserInfoProcessor.fun_userinfo_in_rdd_mapPartitions(
            part, step_conf, cache_conf)
    )
    kept = enriched.filter(lambda row: row is not None)

    # Reflect the keys this step removes/adds in the schema handed downstream.
    removed_keys = step_conf.get('remove.key.list', [])
    added_keys = step_conf.get('add.key.list', [])
    schema_new = update_df_schema(p_schema, removed_keys, added_keys)
    return kept, schema_new
def process(rdd, p_schema, step_conf, cache_conf):
    """Apply the user-info enhancement to every partition of the RDD.

    Input: RDD[Row]; output: (RDD[Row], updated schema).
    None results from the enhancer are filtered out.
    """
    def _enhance(part_iter):
        # Delegate the per-partition work to the shared processor helper.
        return EnhanceUserInfoProcessor.fun_userinfo_in_rdd_mapPartitions(
            part_iter, step_conf, cache_conf)

    ret_rdd = rdd.mapPartitions(_enhance).filter(lambda row: row is not None)

    # Schema follows the configured key removals/additions for this step.
    schema_new = update_df_schema(
        p_schema,
        step_conf.get('remove.key.list', []),
        step_conf.get('add.key.list', []),
    )
    return ret_rdd, schema_new
def process(rdd, p_schema, step_conf, cache_conf):
    """De-duplicate rows, partition by partition.

    Input: RDD[Row]; output: (RDD[Row], updated schema).
    Rows the de-duplicator maps to None are dropped.
    """
    deduped = rdd.mapPartitions(
        lambda part: DeDuplicateProcessor.fun_deduplicate_in_rdd_mapPartitions(
            part, step_conf, cache_conf)
    ).filter(lambda row: row is not None)

    # Reflect the keys this step removes/adds in the schema handed downstream.
    schema_new = update_df_schema(
        p_schema,
        step_conf.get('remove.key.list', []),
        step_conf.get('add.key.list', []),
    )
    # Debug trace of the RDD's cache state (persisting here was left disabled).
    print('= = ' * 10, '[myapp DeDuplicateProcessor.process] ret_rdd.is_cached = ', deduped.is_cached)
    return deduped, schema_new
def process(rdd, p_schema, step_conf, cache_conf=None):
    """Derive time-bucket keys for each row.

    Input: Row[**obj]; output: Row[**obj] plus the updated schema.
    Rows mapped to None are dropped.
    """
    time_key = step_conf['timeKeyName']
    # TODO: add hour-level interval configuration as well.
    interval_minutes = step_conf['add.timeKeyInterval.minutes.list']
    key_prefix = step_conf['add.timeKey.prefix']

    mapped = rdd.map(
        lambda row: EnhanceTimeProcessor.fun_time_in_rdd_map(
            row, time_key, interval_minutes, key_prefix)
    ).filter(lambda row: row is not None)

    # Schema follows the configured key removals/additions for this step.
    schema_new = update_df_schema(
        p_schema,
        step_conf.get('remove.key.list', []),
        step_conf.get('add.key.list', []),
    )
    return mapped, schema_new
def process(rdd, p_schema, step_conf, cache_conf=None):
    """Add time-interval keys to every row of the RDD.

    Input: Row[**obj]; output: Row[**obj] plus the updated schema.
    None results from the time enhancer are filtered out.
    """
    # Pull the step configuration up front so the lambda closes over locals.
    time_key = step_conf['timeKeyName']
    # TODO: add hour-level interval configuration as well.
    minutes_list = step_conf['add.timeKeyInterval.minutes.list']
    prefix = step_conf['add.timeKey.prefix']

    def _add_time_keys(row):
        return EnhanceTimeProcessor.fun_time_in_rdd_map(
            row, time_key, minutes_list, prefix)

    ret_rdd = rdd.map(_add_time_keys).filter(lambda row: row is not None)

    remove_key_list = step_conf.get('remove.key.list', [])
    add_key_list = step_conf.get('add.key.list', [])
    return ret_rdd, update_df_schema(p_schema, remove_key_list, add_key_list)
def process(rdd, p_schema, step_conf, cache_conf):
    """Run per-partition de-duplication over the RDD.

    Input: RDD[Row]; output: (RDD[Row], updated schema).
    None results from the de-duplicator are filtered out.
    """
    def _dedupe(part_iter):
        # Delegate the per-partition work to the shared processor helper.
        return DeDuplicateProcessor.fun_deduplicate_in_rdd_mapPartitions(
            part_iter, step_conf, cache_conf)

    ret_rdd = rdd.mapPartitions(_dedupe).filter(lambda row: row is not None)

    remove_key_list = step_conf.get('remove.key.list', [])
    add_key_list = step_conf.get('add.key.list', [])
    schema_new = update_df_schema(p_schema, remove_key_list, add_key_list)

    # Debug trace of the RDD's cache state (persisting here was left disabled).
    print('= = ' * 10, '[myapp DeDuplicateProcessor.process] ret_rdd.is_cached = ', ret_rdd.is_cached)
    return ret_rdd, schema_new