class Pipeline():
    '''
    Result-storage pipeline for a single task.

    Stores the parsed results of a task, queues the follow-up tasks attached
    to it and marks the task as done. It is not responsible for fetching or
    scheduling the next task to execute.
    '''

    def __init__(self, task):
        '''
        :param task: dict describing the task; the methods of this class read
            its 'results', 'next_tasks', 'parser', 'table', 'parent' and
            '_id' entries
        '''
        self.task = task
        self.taskUtil = TaskUtil()
        self.mongoUtil = MongoUtil()

    def run(self):
        '''
        Dispatch the task's results to the matching storage routine.

        When the task carries at least one result, the follow-up tasks are
        queued, the results are stored, and the whole task is written back
        with state 'done'. When there are no results nothing is stored and
        the task record is left untouched.

        :return: None
        '''
        Log.i('Pipeline.run()')
        results = self.task.get('results')
        if results:
            # Queue the follow-up tasks; a missing or None 'next_tasks'
            # simply means there is nothing to queue.
            for next_task in self.task.get('next_tasks') or []:
                self.taskUtil.insert_one(next_task)

            # Store this task's results. Look up pipeline_<parser name>() by
            # reflection and fall back to pipeline_default() when no such
            # method exists.
            handler = getattr(self, 'pipeline_' + self.task['parser'],
                              self.pipeline_default)
            handler(self.task['table'])

            # Write the complete task back to mongo, marked as done.
            self.task['state'] = 'done'
            self.taskUtil.replace_one(self.task['_id'], self.task)
        # No results means something went wrong along the way; the task is
        # left as it is so that a later run can pick it up again.
        Log.i('this task is finished')

    def pipeline_default(self, collection_name):
        '''
        Default storage routine: insert every result, merged with the
        task's 'parent' fields, into the given collection.

        Example of a stored document in a collection named demo_info:
            {
                '_id': 'http://tieba.baidu.com',
                'name': '百度贴吧'
            }

        A missing or None 'parent' is replaced by an empty dict on the task
        itself. On key clashes the value from the result wins.

        :param collection_name: str name of the target collection
        :return: None
        '''
        if self.task.get('parent') is None:
            self.task['parent'] = {}
        parent = self.task['parent']
        for result in self.task.get('results') or []:
            insert_data = dict(parent, **result)
            self.mongoUtil.insert(collection_name=collection_name,
                                  insert_data=insert_data)

    def __del__(self):
        # __del__ also runs for an instance whose __init__ raised before
        # mongoUtil was assigned, so the attribute may not exist.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
class InitUtil:
    '''
    Initialisation helper: clears all data so that a new round of tasks can
    start from scratch.
    '''

    def __init__(self):
        self.mongoUtil = MongoUtil()

    def init(self):
        '''
        Ask MongoUtil to clear the database named by Setting.MONGO_DB.

        :return: None
        '''
        db = Setting.MONGO_DB
        self.mongoUtil.clear_all(db)

    def __del__(self):
        # __del__ also runs for an instance whose __init__ raised before
        # mongoUtil was assigned, so the attribute may not exist.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
class TaskUtil:
    '''
    Helper for operating on the 'tasks' collection.

    Example task document:
        {
            "parser": "phase",
            "request": "http://so.eduyun.cn/synResource",
            "response": "<html>...</html>",
            "parent": {},
            "state": "done",
            "uptime":
        }
    '''

    def __init__(self, request):
        '''
        Connect to mongo and, when the 'tasks' collection is empty, seed it
        with the content of ../conf/task.json (relative to this file).

        :param request: not used by this constructor; kept so existing
            callers keep working
        '''
        self.mongoUtil = MongoUtil()
        count = self.mongoUtil.count(collection_name='tasks')
        if count == 0:
            task_file = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                '..', 'conf', 'task.json')
            # The context manager closes the file even when reading or
            # parsing fails.
            with open(task_file, encoding='utf-8') as f:
                insert_data = json.load(f)
            self.mongoUtil.insert(collection_name='tasks',
                                  insert_data=insert_data)

    def get_ready(self):
        '''
        Fetch one task that is waiting to be executed and switch it to the
        'doing' state.

        :return: whatever MongoUtil.find_one_and_update returns; presumably
            the task dict, or None when nothing is ready (confirm against
            MongoUtil)
        '''
        # Filter: the state field is missing, or state == 'ready'
        filter_dict = {
            '$or': [
                {'state': {'$exists': False}},
                {'state': 'ready'},
            ]
        }
        # Update: set state to 'doing'
        update_dict = {'$set': {'state': 'doing'}}
        return self.mongoUtil.find_one_and_update(collection_name='tasks',
                                                  filter_dict=filter_dict,
                                                  update_dict=update_dict)

    def set_state(self, id, state):
        '''
        Set the state of a task (ready, doing, done).

        :param id: primary key (_id) of the task
        :param state: str new state value
        :return: None
        '''
        self.mongoUtil.update(collection_name='tasks',
                              filter_dict={'_id': id},
                              update_dict={'$set': {'state': state}})

    def replace_one(self, id, task):
        '''
        Replace the whole task document.

        :param id: primary key (_id) of the task to replace
        :param task: dict new content of the task
        :return: the result of MongoUtil.find_one_and_replace
        '''
        return self.mongoUtil.find_one_and_replace(collection_name='tasks',
                                                   filter_dict={'_id': id},
                                                   replace_dict=task)

    def insert_one(self, task):
        '''
        Insert a task unless a document matching all of its fields already
        exists in the 'tasks' collection.

        The task dict is modified in place: its 'state' is set to 'ready'
        before it is inserted.

        :param task: dict the task to insert
        :return: None
        '''
        existing = self.mongoUtil.find_one(collection_name='tasks',
                                           filter_dict=task)
        if existing is None:
            task['state'] = 'ready'
            self.mongoUtil.insert(collection_name='tasks', insert_data=task)

    def __del__(self):
        # __del__ also runs for an instance whose __init__ raised before
        # mongoUtil was assigned, so the attribute may not exist.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
class TaskUtil:
    '''
    Helper for operating on the 'tasks' collection.

    Example task document:
        {
            "parser": "phase",
            "request": "http://so.eduyun.cn/synResource",
            "response": "<html>...</html>",
            "parent": {},
            "state": "done",
            "uptime":
        }
    '''

    def __init__(self):
        '''
        Connect to mongo and, when the 'tasks' collection is empty, seed it
        with a first task built from Setting.FIRST_TASK_PARSER,
        Setting.FIRST_TASK_URL and Setting.FIRST_TASK_TABLE.
        '''
        self.mongoUtil = MongoUtil()
        count = self.mongoUtil.count(collection_name='tasks')
        if count == 0:
            insert_data = {
                "parser": Setting.FIRST_TASK_PARSER,
                "request": Setting.FIRST_TASK_URL,
                "table": Setting.FIRST_TASK_TABLE,
                "parent": {},
                "state": "ready",
            }
            self.mongoUtil.insert(collection_name='tasks',
                                  insert_data=insert_data)

    def get_ready(self):
        '''
        Fetch one task that is waiting to be executed and switch it to the
        'doing' state.

        :return: whatever MongoUtil.find_one_and_update returns; presumably
            the task dict, or None when nothing is ready (confirm against
            MongoUtil)
        '''
        # Filter: the state field is missing, or state == 'ready'
        filter_dict = {
            '$or': [
                {'state': {'$exists': False}},
                {'state': 'ready'},
            ]
        }
        # Update: set state to 'doing'
        update_dict = {'$set': {'state': 'doing'}}
        return self.mongoUtil.find_one_and_update(collection_name='tasks',
                                                  filter_dict=filter_dict,
                                                  update_dict=update_dict)

    def set_state(self, id, state):
        '''
        Set the state of a task (ready, doing, done).

        :param id: primary key (_id) of the task
        :param state: str new state value
        :return: None
        '''
        self.mongoUtil.update(collection_name='tasks',
                              filter_dict={'_id': id},
                              update_dict={'$set': {'state': state}})

    def replace_one(self, id, task):
        '''
        Replace the whole task document.

        :param id: primary key (_id) of the task to replace
        :param task: dict new content of the task
        :return: the result of MongoUtil.find_one_and_replace
        '''
        return self.mongoUtil.find_one_and_replace(collection_name='tasks',
                                                   filter_dict={'_id': id},
                                                   replace_dict=task)

    def insert_one(self, task):
        '''
        Insert a task unless a document matching all of its fields already
        exists in the 'tasks' collection.

        The task dict is modified in place: its 'state' is set to 'ready'
        before it is inserted.

        :param task: dict the task to insert
        :return: None
        '''
        existing = self.mongoUtil.find_one(collection_name='tasks',
                                           filter_dict=task)
        if existing is None:
            task['state'] = 'ready'
            self.mongoUtil.insert(collection_name='tasks', insert_data=task)

    def __del__(self):
        # __del__ also runs for an instance whose __init__ raised before
        # mongoUtil was assigned, so the attribute may not exist.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()