def filter_thread(threadId, options):
    """Data-cleaning worker thread.

    Repeatedly pulls batches of messages from Elasticsearch, marks them as
    PROGRESSING, runs every loaded plugin over each message, and writes the
    merged result back with state COMPLETED.  Plugin results are cached per
    ``ip:port`` (per URL for HTTP messages) so identical targets are not
    re-processed on every hit.

    :param threadId: thread index, used only to prefix log messages
    :param options: parsed program options (reads rootdir, debug, index,
                    range, batch_size, mode)
    """
    global es, cacheIds, cache, threadExit, threadLock, processCount

    # Load the plugin list; with no plugins there is nothing to do.
    plugins = Plugin.loadPlugins(options.rootdir, options.debug)
    output('Thread {}: Plugins loaded.'.format(threadId), LogLevel.INFO)
    if not plugins:
        return

    while True:
        if threadExit:
            break
        try:
            # Hold the lock via a context manager so it is released even if
            # search_by_time raises.  The previous acquire()/release() pair
            # leaked the lock on error (the outer except swallowed the
            # exception), deadlocking every other worker thread.
            with threadLock:
                data = search_by_time(es, options.index + '*',
                                      time_range=options.range,
                                      size=options.batch_size,
                                      mode=options.mode)
            if not data:
                output('Thread {}: No new msg, waiting 2s ...'.format(threadId), LogLevel.DEBUG)
                time.sleep(2)
                if threadExit:
                    break
                continue

            # Mark the fetched ES documents as "in progress".
            actions = []
            # Iterate backwards so `del data[i]` cannot shift unvisited items.
            for i in range(len(data) - 1, -1, -1):
                # Cache processed IDs so multiple threads do not handle the
                # same document twice.
                if cacheIds.get(data[i]['_id']):
                    del data[i]
                    continue
                cacheIds.set(data[i]['_id'], True)
                src = data[i]['_source']
                # Documents lacking the mandatory fields cannot be processed.
                if 'ip' not in src or 'port' not in src or 'pro' not in src:
                    del data[i]
                    continue
                actions.append({
                    '_op_type': 'update',
                    '_index': data[i]['_index'],
                    '_type': data[i]['_type'],
                    '_id': data[i]['_id'],
                    'doc': {
                        'state': MsgState.PROGRESSING
                    }
                })
            if not actions:
                time.sleep(1)
                if threadExit:
                    break
                continue

            # IDs that failed the update were claimed by another worker and
            # must be skipped below.
            conflict_list = batch_update(es, actions)
            with threadLock:
                processCount += len(data)

            actions = []
            while data:
                item = data.pop()
                # Skip documents lost to a concurrent update conflict.
                if item['_id'] in conflict_list:
                    continue
                msg = item['_source']
                # The cache (keyed on ip:port, or URL for HTTP) throttles how
                # often the plugin chain runs against the same target.
                cache_key = '{}:{}'.format(msg['ip'], msg['port'])
                if msg['pro'] == 'HTTP':
                    cache_key = msg['url']
                cacheMsg = cache.get(cache_key)
                if cacheMsg:
                    output('Thread {}: Use cached result, key={}'.format(threadId, cache_key), LogLevel.DEBUG)
                    actions.append({
                        '_type': item['_type'],
                        '_op_type': 'update',
                        '_index': item['_index'],
                        '_id': item['_id'],
                        'doc': cacheMsg
                    })
                    continue

                # Run the plugins in order (order is defined in the config
                # file via the sorted keys).
                msg_update = {}
                stime = time.time()
                for idx in sorted(plugins.keys()):
                    (pluginName, plugin) = plugins[idx]
                    output('Thread {}: Plugin {} processing ...'.format(threadId, pluginName), LogLevel.DEBUG)
                    try:
                        ret = plugin.execute(msg)
                        if ret:
                            # Merge the plugin's result into both the pending
                            # update and the message seen by later plugins.
                            msg_update = dict(msg_update, **ret)
                            msg = dict(msg, **ret)
                    except Exception:
                        # A broken plugin must not abort the whole batch;
                        # narrowed from a bare `except:` that also swallowed
                        # SystemExit/KeyboardInterrupt.
                        output(traceback.format_exc(), LogLevel.ERROR)
                    output('Thread {}: Plugin {} completed.'.format(threadId, pluginName), LogLevel.DEBUG)
                output("Elapsed time: {}".format(time.time() - stime), LogLevel.DEBUG)

                # Queue the final result and cache it for identical targets.
                msg_update['state'] = MsgState.COMPLETED
                cache.set(cache_key, msg_update)
                actions.append({
                    '_type': item['_type'],
                    '_op_type': 'update',
                    '_index': item['_index'],
                    '_id': item['_id'],
                    'doc': msg_update
                })

            # Flush the processed batch back to ES.
            if actions:
                output('Thread {}: Batch update {} document.'.format(threadId, len(actions)), LogLevel.INFO)
                output('Thread {}: {}'.format(threadId, json.dumps(actions)), LogLevel.DEBUG)
                batch_update(es, actions)
                actions = []
        except Exception:
            # Never let an unexpected error kill the worker; log and retry.
            output(traceback.format_exc(), LogLevel.ERROR)