    def cache_dataset(self, sc, cache_conf):
        broadcast_enabled = cache_conf.get('broadcast.enabled', False)

        if isinstance(broadcast_enabled, bool) and broadcast_enabled:
            cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])

            if cache_id in self.cache_pools:
                print('= = ' * 10, '[myapp CacheManager.cache_dataset] dataset already cached, reusing broadcast')
                return self.cache_pools[cache_id]
            else:
                host = cache_conf['host']
                port = cache_conf.get('port', 3306)
                db = cache_conf['db']
                user = cache_conf['user']
                password = cache_conf.get('password', '')

                table_name = cache_conf['tableName']
                key_name = cache_conf['keyName']
                cache_key_name_list = cache_conf['cache.keyName.list']

                cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
                conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)
                external_cache = MySQLUtils.query(conn, sql_text=cache_sql, key_name=key_name)

                cache_broadcast = sc.broadcast(external_cache)

                self.cache_pools[cache_id] = cache_broadcast
                return cache_broadcast
        else:
            print('= = ' * 10, '[myapp CacheManager.cache_dataset] '
                               'configuration warning: cache is not enabled, broadcast.enabled =',
                  broadcast_enabled)
            return None
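
# Usage sketch (an assumption, not part of the original source): cache_dataset reads its
# connection and table settings from a flat cache_conf dict and returns a Spark broadcast
# variable, or None when broadcast.enabled is falsy. Names and values below are hypothetical;
# cache_manager is assumed to be an instance of the class that owns cache_dataset.
def example_cache_dataset_usage(sc, cache_manager):
    cache_conf = {
        'broadcast.enabled': True,
        'source.id': 1,
        'cache.id': 1,
        'host': '127.0.0.1',                      # hypothetical connection settings
        'port': 3306,
        'db': 'edxapp',
        'user': 'reader',
        'password': '',
        'tableName': 'api_deviceinfo',
        'keyName': 'uuid',
        'cache.keyName.list': 'channel, event, uid',
    }
    cache_broadcast = cache_manager.cache_dataset(sc, cache_conf)
    if cache_broadcast is not None:
        # the broadcast value is a dict keyed by the keyName column (here: uuid)
        return cache_broadcast.value
    return None
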
def get_api_deviceinfo():
    host = '192.168.9.228'
    port = 3306
    db = 'edxapp'
    user = '******'
    password = ''
    # table and key columns to cache
    table_name = 'api_deviceinfo'
    key_name = 'uuid'
    cache_key_name_list = 'channel, event, uid'
    # build the full-table cache query
    cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
    # open the MySQL connection and load every row, keyed by uuid
    conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)
    external_cache = MySQLUtils.query(conn, sql_text=cache_sql, key_name=key_name)
    return external_cache
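
# MySQLUtils itself is not shown in these snippets. Below is a minimal sketch of the
# interface the code above and the processors below rely on (get_connection, query,
# and the shared cache_pools dict), written against pymysql as an assumption; the
# project's real implementation may differ.
import pymysql
import pymysql.cursors


class MySQLUtils(object):
    # per-process cache pools, keyed by cache_id (used via cache_pools.setdefault below)
    cache_pools = {}

    @staticmethod
    def get_connection(host, db, user, password, port=3306):
        return pymysql.connect(host=host, user=user, password=password,
                               database=db, port=port,
                               cursorclass=pymysql.cursors.DictCursor)

    @staticmethod
    def query(conn, sql_text, key_name, sql_args=None):
        # return {row[key_name]: row_dict} so callers can look rows up by their join key
        with conn.cursor() as cursor:
            cursor.execute(sql_text, sql_args)
            return {row[key_name]: row for row in cursor.fetchall()}
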
    def fun_userinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
        """
        输入: iter[Row]
        输出: iter[Row]
        """
        from datetime import datetime, timedelta

        DATA_JOIN_KEY = 'date_joined'

        host = cache_conf['host']
        port = cache_conf.get('port', 3306)
        db = cache_conf['db']
        user = cache_conf['user']
        password = cache_conf.get('password', '')

        table_name = cache_conf['tableName']
        key_name = cache_conf['keyName']  # join key column name
        cache_key_name_list = cache_conf['cache.keyName.list']

        cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
        query_sql = cache_sql + ' where ' + key_name + ' = %s'
        # TODO: support batch queries
        # batch_query_sql = cache_sql + ' where ' + key_name + " in (?)"

        # TODO: at application startup, cache a copy of the external table via broadcast
        broadcast_enabled = cache_conf.get('broadcast.enabled', False) \
            if isinstance(cache_conf.get('broadcast.enabled'), bool) else False

        # incrementally populated in-memory cache
        cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
        cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

        conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

        # update the joined attribute on the row dict
        def proc_update(obj, date_joined=None):
            obj[DATA_JOIN_KEY] = date_joined if date_joined else ''

        for row in iter_x:

            obj = row.asDict()

            key_value = (obj.get(key_name) or '').strip()  # guard against None values in the Row

            # key_value is assumed to be unique; it may also be empty
            if str(key_value) == '':
                # no join info to look up for an empty key; fall back to the default value
                proc_update(obj)
            else:
                if key_value in cache_pool:
                    key_cache = cache_pool[key_value]
                    # hit in the in-memory cache
                    proc_update(obj, key_cache[DATA_JOIN_KEY])
                elif broadcast_enabled and key_value in cache_conf.get('broadcast').value:
                    # hit in the broadcast cache
                    key_cache = cache_conf.get('broadcast').value[key_value]
                    print('= = ' * 10,
                          '[myapp EnhanceUserInfoProcessor.process.fun_userinfo_in_rdd_mapPartitions] found '
                          'broadcast_cache = ', key_cache)
                    proc_update(obj, key_cache[DATA_JOIN_KEY])
                else:
                    # not cached in memory; query the external cache (MySQL)
                    external_cache = MySQLUtils.query(
                        conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))

                    if len(external_cache) == 0:
                        # not in the external cache either: set the default value
                        proc_update(obj)
                    else:
                        # join info found: update the field(s) from it
                        key_cache = external_cache[key_value]
                        proc_update(obj, key_cache[DATA_JOIN_KEY])
                        # update the in-memory cache
                        for k, v in external_cache.items():
                            cache_pool[k] = v
                        external_cache.clear()

            yield Row(**obj)
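
# Wiring sketch (an assumption, not part of the original source): the *_in_rdd_mapPartitions
# functions take the partition iterator plus the step/cache configuration, so they are
# typically bound with those configs and handed to RDD.mapPartitions. The processor function
# is assumed to be in scope here; cache_manager refers to the CacheManager shown above.
def apply_userinfo_enrichment(sc, rdd, step_conf, cache_conf, cache_manager=None):
    if cache_manager is not None:
        # when broadcast.enabled is set, attach the broadcast built by CacheManager.cache_dataset
        # so the per-row lookup in the processor can hit it before querying MySQL
        cache_conf['broadcast'] = cache_manager.cache_dataset(sc, cache_conf)
    return rdd.mapPartitions(
        lambda iter_x: fun_userinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf))
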
    def fun_courseinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
        """
        输入: iter[Row]
        输出: iter[Row]
        """
        from datetime import datetime, timedelta

        COURSE_TYPE_KEY = 'course_type'
        COURSE_OWNER_KEY = 'owner'
        COURSE_STATUS_KEY = 'status'
        COURSE_START_KEY = 'start'
        COURSE_END_KEY = 'end'

        COURSE_PROCESS_KEY = 'course_process'

        host = cache_conf['host']
        port = cache_conf.get('port', 3306)
        db = cache_conf['db']
        user = cache_conf['user']
        password = cache_conf.get('password', '')

        table_name = cache_conf['tableName']
        key_name = cache_conf['keyName']  # course_id
        cache_key_name_list = cache_conf['cache.keyName.list']

        cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
        print('= = ' * 10,
              '[myapp EnhanceCourseInfoProcessor.process.fun_courseinfo_in_rdd_mapPartitions] cache_sql = ' + cache_sql)
        query_sql = cache_sql + ' where ' + key_name + ' = %s'
        # TODO: support batch queries
        batch_query_sql = cache_sql + ' where ' + key_name + " in (?)"

        # TODO: at application startup, cache a copy of the external table via broadcast
        broadcast_enabled = cache_conf.get('broadcast.enabled', False) \
            if isinstance(cache_conf.get('broadcast.enabled'), bool) else False

        # incrementally populated in-memory cache
        cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
        cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

        conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

        # def check_process(course_id, course_type, status, start, end, et, course_map):
        #     """
        #     used for offline (batch) computation
        #     """
        #     if course_type == "0":
        #         return -1 if (start is None or start > et or status == "-1") else (1 if (end < et or course_map.has_key(course_id)) else 0)
        #     if course_type == "1":
        #         return 1 if status == "-1" else 0
        #     return -1

        def check_process(course_type, course_status, course_start, course_end, check_date):
            """
            实时计算使用
            et: 取处理记录时的 CST 时间
            course_map: 不用判断
            """
            if course_type == "0":
                return -1 \
                    if (course_start is None or course_start > check_date or course_status == "-1") \
                    else (1 if (course_end < check_date) else 0)
            elif course_type == "1":
                return 1 if course_status == "-1" else 0
            else:
                return -1
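
        # Worked example of check_process (derived from the logic above; dates are hypothetical,
        # compared as '%Y-%m-%d %H:%M:%S' strings, and the apparent meaning of the return value
        # is -1 = not started / not open, 0 = in progress, 1 = already ended):
        #   check_process("0", "1", "2020-01-01 00:00:00", "2020-06-01 00:00:00", "2020-03-01 00:00:00") -> 0
        #   check_process("0", "1", "2020-01-01 00:00:00", "2020-06-01 00:00:00", "2020-07-01 00:00:00") -> 1
        #   check_process("0", "1", "2020-04-01 00:00:00", "2020-06-01 00:00:00", "2020-03-01 00:00:00") -> -1
        #   check_process("1", "-1", None, None, check_date) -> 1
        #   check_process("1", "0",  None, None, check_date) -> 0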

        # update course attributes on the row dict
        # note: the check_date default is evaluated when proc_update is defined,
        # i.e. once per partition, not on every call
        def proc_update(
                obj,
                course_type=None, course_owner=None, course_status=None, course_start=None, course_end=None,
                check_date=(datetime.utcnow() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')):
            obj[COURSE_TYPE_KEY] = course_type
            obj[COURSE_OWNER_KEY] = course_owner
            obj[COURSE_STATUS_KEY] = course_status
            obj[COURSE_START_KEY] = course_start
            obj[COURSE_END_KEY] = course_end

            course_process = check_process(course_type, course_status, course_start, course_end, check_date)

            obj[COURSE_PROCESS_KEY] = course_process

        for row in iter_x:

            obj = row.asDict()

            key_value = (obj.get(key_name) or '').strip()  # course_id; guard against None values

            # key_value is assumed to be unique; it may also be empty
            if str(key_value) == '':
                # no join info to look up for an empty course_id; fall back to the default values
                proc_update(obj)
            else:
                if key_value in cache_pool:
                    key_cache = cache_pool[key_value]
                    # hit in the in-memory cache
                    proc_update(obj,
                                key_cache[COURSE_TYPE_KEY], key_cache[COURSE_OWNER_KEY], key_cache[COURSE_STATUS_KEY],
                                key_cache[COURSE_START_KEY], key_cache[COURSE_END_KEY])
                elif broadcast_enabled and key_value in cache_conf.get('broadcast').value:
                    # hit in the broadcast cache
                    key_cache = cache_conf.get('broadcast').value[key_value]
                    print('= = ' * 10,
                          '[myapp EnhanceCourseInfoProcessor.process.fun_courseinfo_in_rdd_mapPartitions] found '
                          'broadcast_cache = ', key_cache)
                    proc_update(obj,
                                key_cache[COURSE_TYPE_KEY], key_cache[COURSE_OWNER_KEY], key_cache[COURSE_STATUS_KEY],
                                key_cache[COURSE_START_KEY], key_cache[COURSE_END_KEY])
                else:
                    # not cached in memory; query the external cache (MySQL)
                    external_cache = MySQLUtils.query(
                        conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))

                    if len(external_cache) == 0:
                        # not in the external cache either: set the default values
                        proc_update(obj)
                    else:
                        # join info found: update the fields from it and derive course_process
                        key_cache = external_cache[key_value]
                        proc_update(obj,
                                    key_cache[COURSE_TYPE_KEY], key_cache[COURSE_OWNER_KEY], key_cache[COURSE_STATUS_KEY],
                                    key_cache[COURSE_START_KEY], key_cache[COURSE_END_KEY])
                        # update the in-memory cache
                        for k, v in external_cache.items():
                            cache_pool[k] = v
                        external_cache.clear()

            yield Row(**obj)
    def fun_deviceinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
        """
        输入: iter[Row]
        输出: iter[Row]
        """
        import re

        domain_pattern = re.compile('https?://(.*?)/.*')
        ORIGIN_REFERER_KEY = 'origin_referer'
        CHANNEL_KEY = 'channel'
        UNKNOWN_ORIGIN_REFERER_VALUE = 'unknown'

        USER_ID_KEY = 'user_id'
        UID_KEY = 'uid'

        SPAM_KEY = 'spam'
        EVENT_KEY = 'event'
        UNKNOWN_SPAM_VALUE = 'unknown'


        host = cache_conf['host']
        port = cache_conf.get('port', 3306)
        db = cache_conf['db']
        user = cache_conf['user']
        password = cache_conf.get('password', '')

        table_name = cache_conf['tableName']
        key_name = cache_conf['keyName']
        cache_key_name_list = cache_conf['cache.keyName.list']

        cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
        query_sql = cache_sql + ' where ' + key_name + ' = %s'
        # TODO: support batch queries
        batch_query_sql = cache_sql + ' where ' + key_name + " in (?)"

        # TODO: at application startup, cache a copy of the external table via broadcast
        broadcast_enabled = cache_conf.get('broadcast.enabled', False) \
            if isinstance(cache_conf.get('broadcast.enabled'), bool) else False

        # incrementally populated in-memory cache
        cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
        cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

        conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

        # update rules
        # 1. origin_referer: if origin_referer in the log is empty (null or "") and the
        #    device-info channel field is non-empty, take the channel value; otherwise keep
        #    the log's origin_referer. Note: origin_referer is currently reduced to its domain.
        # 2. spam: if spam in the user log is empty (null or "") and the device-info event
        #    field is non-empty, take the event value; otherwise keep the log's spam value.
        # 3. user_id: if a cached uid is joined, update user_id; the user_id in the log
        #    may not be valid.

        def proc_update(obj, cache_channel=None, cache_event=None, cache_uid=None):
            if not obj[ORIGIN_REFERER_KEY] and cache_channel:
                obj[ORIGIN_REFERER_KEY] = cache_channel

            if obj[ORIGIN_REFERER_KEY]:
                match_result = re.match(domain_pattern, obj[ORIGIN_REFERER_KEY])
                if match_result:
                    obj[ORIGIN_REFERER_KEY] = match_result.group(1)

            if not obj[ORIGIN_REFERER_KEY]:
                obj[ORIGIN_REFERER_KEY] = UNKNOWN_ORIGIN_REFERER_VALUE

            if not obj[SPAM_KEY] and cache_event:
                obj[SPAM_KEY] = cache_event

            if not obj[SPAM_KEY]:
                obj[SPAM_KEY] = UNKNOWN_SPAM_VALUE

            if cache_uid and obj[USER_ID_KEY] != cache_uid:
                obj[USER_ID_KEY] = cache_uid

        for row in iter_x:
            obj = row.asDict()

            key_value = (obj.get(key_name) or '').strip()  # guard against None values in the Row
            print('= = ' * 10,
                  '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found key_value =',
                  key_value, ', obj = ', obj)

            # key_value is assumed to be unique; it may also be empty
            if str(key_value) == '':
                # no join info to look up: web-side logs have an empty uuid, so there is no need
                # to join mysql.api_deviceinfo; update origin_referer and spam from the log itself
                proc_update(obj)
            else:
                # check the in-memory cache first, then the broadcast cache, and finally the external cache
                if key_value in cache_pool and isinstance(cache_pool[key_value], dict):
                    key_cache = cache_pool[key_value]
                    # hit in the in-memory cache
                    proc_update(obj, key_cache[CHANNEL_KEY], key_cache[EVENT_KEY], key_cache[UID_KEY])

                elif broadcast_enabled and key_value in cache_conf.get('broadcast').value:
                    # hit in the broadcast cache
                    key_cache = cache_conf.get('broadcast').value[key_value]
                    print('= = ' * 10,
                          '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found '
                          'broadcast_cache = ', key_cache)
                    proc_update(obj, key_cache[CHANNEL_KEY], key_cache[EVENT_KEY], key_cache[UID_KEY])
                else:
                    # not cached in memory; query the external cache (MySQL)
                    external_cache = MySQLUtils.query(
                        conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))
                    print('= = ' * 10,
                          '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found '
                          'external_cache = ', external_cache)
                    if len(external_cache) == 0:
                        # not in the external cache either: update origin_referer and spam from the log's own fields
                        proc_update(obj)
                    else:
                        # join info found: first update the fields from it, then update origin_referer and spam
                        key_cache = external_cache[key_value]
                        print('= = ' * 10,
                              '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found '
                              'mysql_cache = ', key_cache)
                        proc_update(obj, key_cache[CHANNEL_KEY], key_cache[EVENT_KEY], key_cache[UID_KEY])
                        # update the in-memory cache
                        for k, v in external_cache.items():
                            cache_pool[k] = v
                        external_cache.clear()

            yield Row(**obj)
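
# Example of the device-info enrichment (derived from proc_update above; values hypothetical):
#   input row:  {'uuid': 'abc', 'origin_referer': 'https://example.com/page', 'spam': None, 'user_id': 0}
#   cache row:  {'channel': 'app_store', 'event': 'organic', 'uid': 42}
#   output row: {'uuid': 'abc', 'origin_referer': 'example.com', 'spam': 'organic', 'user_id': 42}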