Exemple #1
0
def filter_thread(threadId, options):
    """
    Data-cleaning worker thread.

    Repeatedly fetches a batch of documents from Elasticsearch, marks them as
    ``MsgState.PROGRESSING``, runs every loaded plugin over each document and
    writes the merged plugin results back with ``MsgState.COMPLETED``. The
    loop runs until the module-level ``threadExit`` flag becomes truthy.

    Relies on module-level state: ``es`` (search/update client), ``cacheIds``
    (IDs already claimed by a worker), ``cache`` (plugin results keyed per
    target), ``threadLock`` and ``processCount``.

    :param threadId: index of this worker thread (used in log messages only)
    :param options: program options; reads ``rootdir``, ``debug``, ``index``,
        ``range``, ``batch_size`` and ``mode``
    """
    global es, cacheIds, cache, threadExit, threadLock, processCount

    # Load the plugin list.
    plugins = Plugin.loadPlugins(options.rootdir, options.debug)
    output('Thread {}: Plugins loaded.'.format(threadId), LogLevel.INFO)

    if not plugins:
        return

    # Plugins run in key order (the order is defined in the configuration
    # file). The plugin set does not change, so sort once instead of once
    # per message.
    ordered_plugins = [plugins[key] for key in sorted(plugins.keys())]

    while not threadExit:
        try:
            # Use the lock as a context manager so it is released even when
            # the search raises; a bare acquire()/release() pair would leave
            # the lock held and block every other worker forever.
            with threadLock:
                data = search_by_time(
                    es,
                    options.index + '*',
                    time_range=options.range,
                    size=options.batch_size,
                    mode=options.mode,
                )

            if not data:
                output('Thread {}: No new msg, waiting 2s ...'.format(threadId), LogLevel.DEBUG)
                time.sleep(2)
                continue

            # Mark the documents in ES as "in progress".
            # The batch is walked back-to-front so that both the update
            # actions and the processing order below match a pop-from-the-end
            # traversal of the search result.
            pending = []
            actions = []
            for item in reversed(data):
                doc_id = item['_id']
                # Remember handled IDs so the same document is not processed
                # again by another thread.
                if cacheIds.get(doc_id):
                    continue
                cacheIds.set(doc_id, True)

                source = item['_source']
                if 'ip' not in source or 'port' not in source or 'pro' not in source:
                    continue

                pending.append(item)
                actions.append({
                    '_op_type': 'update',
                    '_index': item['_index'],
                    '_type': item['_type'],
                    '_id': doc_id,
                    'doc': {'state': MsgState.PROGRESSING},
                })

            if not actions:
                time.sleep(1)
                continue

            conflict_list = batch_update(es, actions)
            with threadLock:
                processCount += len(pending)

            actions = []
            for item in pending:
                # Skip documents that conflicted or were already handled.
                if item['_id'] in conflict_list:
                    continue

                msg = item['_source']
                # Use the cache to reduce how often the plugins have to run.
                cache_key = '{}:{}'.format(msg['ip'], msg['port'])
                if msg['pro'] == 'HTTP':
                    # NOTE(review): an HTTP message without a 'url' field
                    # raises KeyError here and aborts the rest of the batch
                    # via the outer handler - confirm 'url' is always present.
                    cache_key = msg['url']

                cacheMsg = cache.get(cache_key)
                if cacheMsg:
                    output('Thread {}: Use cached result, key={}'.format(threadId, cache_key), LogLevel.DEBUG)
                    actions.append({
                        '_type': item['_type'],
                        '_op_type': 'update',
                        '_index': item['_index'],
                        '_id': item['_id'],
                        'doc': cacheMsg,
                    })
                    continue

                msg_update = {}
                stime = time.time()
                for pluginName, plugin in ordered_plugins:
                    output('Thread {}: Plugin {} processing ...'.format(threadId, pluginName), LogLevel.DEBUG)

                    try:
                        ret = plugin.execute(msg)
                        if ret:
                            # Accumulate the fields to write back, and let
                            # later plugins see the earlier plugins' output.
                            msg_update = dict(msg_update, **ret)
                            msg = dict(msg, **ret)
                    except Exception:
                        # A failing plugin must not stop the remaining ones.
                        output(traceback.format_exc(), LogLevel.ERROR)

                    output('Thread {}: Plugin {} completed.'.format(threadId, pluginName), LogLevel.DEBUG)

                output("Elapsed time: {}".format(time.time() - stime), LogLevel.DEBUG)
                # Record the result and cache it for later messages that
                # share the same key.
                msg_update['state'] = MsgState.COMPLETED
                cache.set(cache_key, msg_update)

                actions.append({
                    '_type': item['_type'],
                    '_op_type': 'update',
                    '_index': item['_index'],
                    '_id': item['_id'],
                    'doc': msg_update,
                })

            # Submit the results to ES.
            if actions:
                output('Thread {}: Batch update {} document.'.format(threadId, len(actions)), LogLevel.INFO)
                output('Thread {}: {}'.format(threadId, json.dumps(actions)), LogLevel.DEBUG)
                batch_update(es, actions)

        except Exception:
            # Log and keep the worker alive. SystemExit / KeyboardInterrupt
            # are deliberately not caught so the thread can still be stopped.
            output(traceback.format_exc(), LogLevel.ERROR)