def process(rdd, p_schema, step_conf, cache_conf):
    """Enrich every Row with user info and refresh the schema.

    Input:  RDD[Row]
    Output: (RDD[Row], updated schema)
    """
    enriched = rdd.mapPartitions(
        lambda part: EnhanceUserInfoProcessor.fun_userinfo_in_rdd_mapPartitions(
            part, step_conf, cache_conf))
    # Enrichment may yield None for rows it cannot handle; drop those.
    enriched = enriched.filter(lambda r: r is not None)

    keys_removed = step_conf.get('remove.key.list', [])
    keys_added = step_conf.get('add.key.list', [])
    new_schema = update_df_schema(p_schema, keys_removed, keys_added)

    return enriched, new_schema
# Example #2 (scraped-snippet separator; vote count was "0")
    def process(rdd, p_schema, step_conf, cache_conf):
        """Apply partition-wise user-info enrichment and update the schema.

        Input:  RDD[Row]
        Output: (RDD[Row], updated schema)
        """
        def _enrich_partition(part):
            # Delegate the heavy lifting to the processor helper.
            return EnhanceUserInfoProcessor.fun_userinfo_in_rdd_mapPartitions(
                part, step_conf, cache_conf)

        result_rdd = rdd.mapPartitions(_enrich_partition)
        result_rdd = result_rdd.filter(lambda r: r is not None)

        dropped = step_conf.get('remove.key.list', [])
        added = step_conf.get('add.key.list', [])
        refreshed_schema = update_df_schema(p_schema, dropped, added)

        return result_rdd, refreshed_schema
    def process(rdd, p_schema, step_conf, cache_conf):
        """Deduplicate rows partition-wise and refresh the schema.

        Input:  RDD[Row]
        Output: (RDD[Row], updated schema)
        """
        deduped = rdd.mapPartitions(
            lambda part: DeDuplicateProcessor.fun_deduplicate_in_rdd_mapPartitions(
                part, step_conf, cache_conf)
        ).filter(lambda r: r is not None)

        dropped_keys = step_conf.get('remove.key.list', [])
        added_keys = step_conf.get('add.key.list', [])
        refreshed_schema = update_df_schema(p_schema, dropped_keys, added_keys)

        # Debug trace of the RDD cache state; persisting is deliberately
        # left disabled here.
        print('= = ' * 10, '[myapp DeDuplicateProcessor.process] ret_rdd.is_cached = ', deduped.is_cached)
        return deduped, refreshed_schema
    def process(rdd, p_schema, step_conf, cache_conf=None):
        """Derive time-bucket fields on each Row and refresh the schema.

        Input:  Row[**obj]
        Output: (RDD[Row], updated schema)
        """
        key_name = step_conf['timeKeyName']
        # TODO: add hour-level interval configuration
        interval_minutes_list = step_conf['add.timeKeyInterval.minutes.list']
        key_prefix = step_conf['add.timeKey.prefix']

        mapped = rdd.map(
            lambda r: EnhanceTimeProcessor.fun_time_in_rdd_map(
                r, key_name, interval_minutes_list, key_prefix))
        # Rows the mapper could not process come back as None; discard them.
        mapped = mapped.filter(lambda r: r is not None)

        removals = step_conf.get('remove.key.list', [])
        additions = step_conf.get('add.key.list', [])
        schema_out = update_df_schema(p_schema, removals, additions)

        return mapped, schema_out
# Example #5 (scraped-snippet separator; vote count was "0")
    def process(rdd, p_schema, step_conf, cache_conf=None):
        """Attach derived time-key fields to every Row, then update the schema.

        Input:  Row[**obj]
        Output: (RDD[Row], updated schema)
        """
        time_field = step_conf['timeKeyName']
        # TODO: support an hour-granularity configuration as well
        minute_intervals = step_conf['add.timeKeyInterval.minutes.list']
        prefix = step_conf['add.timeKey.prefix']

        def _map_row(r):
            return EnhanceTimeProcessor.fun_time_in_rdd_map(
                r, time_field, minute_intervals, prefix)

        out_rdd = rdd.map(_map_row).filter(lambda r: r is not None)

        to_remove = step_conf.get('remove.key.list', [])
        to_add = step_conf.get('add.key.list', [])
        updated_schema = update_df_schema(p_schema, to_remove, to_add)

        return out_rdd, updated_schema
# Example #6 (scraped-snippet separator; vote count was "0")
    def process(rdd, p_schema, step_conf, cache_conf):
        """Run partition-level deduplication over the RDD and refresh the schema.

        Input:  RDD[Row]
        Output: (RDD[Row], updated schema)
        """
        def _dedupe(part):
            # Per-partition deduplication delegated to the processor helper.
            return DeDuplicateProcessor.fun_deduplicate_in_rdd_mapPartitions(
                part, step_conf, cache_conf)

        out_rdd = rdd.mapPartitions(_dedupe).filter(lambda r: r is not None)

        remove_keys = step_conf.get('remove.key.list', [])
        add_keys = step_conf.get('add.key.list', [])
        schema_updated = update_df_schema(p_schema, remove_keys, add_keys)

        # Debug trace only; explicit persist() stays disabled on purpose.
        print('= = ' * 10,
              '[myapp DeDuplicateProcessor.process] ret_rdd.is_cached = ',
              out_rdd.is_cached)
        return out_rdd, schema_updated