Exemple #1
0
class Pipeline():
    '''
    Result-storage pipeline.

    For one processed task it enqueues the follow-up tasks listed in
    task['next_tasks'], stores the parsed results and marks the task as done.
    '''
    def __init__(self, task):
        '''
        :param task: dict, the task being processed; run() reads its
                     'results', 'next_tasks', 'parser', 'table' and '_id'
                     entries, pipeline_default() reads 'parent' and 'results'
        '''
        self.task = task
        self.taskUtil = TaskUtil()
        self.mongoUtil = MongoUtil()

    def run(self):
        '''
        Dispatch the task's results to the matching storage routine.

        When the task has at least one result, the follow-up tasks are
        enqueued, the results are stored, and the task is written back with
        state 'done'. When there are no results the task is left untouched.
        :return: None
        '''
        Log.i('Pipeline.run()')
        results = self.task['results']
        if results is not None and len(results) > 0:
            # Enqueue the tasks for the next round.
            if self.task['next_tasks'] is not None:
                for next_task in self.task['next_tasks']:
                    self.taskUtil.insert_one(next_task)
            # Store this round's results: look up pipeline_<parser name>() by
            # reflection and fall back to pipeline_default() if it is missing.
            handler = getattr(self, 'pipeline_' + self.task['parser'],
                              self.pipeline_default)
            handler(self.task['table'])
            # Write the complete task back and mark it as done.
            self.task['state'] = 'done'
            self.taskUtil.replace_one(self.task['_id'], self.task)
        # No results means something went wrong earlier; nothing is stored
        # here and the task state is not changed.
        Log.i('this task is finished')

    def pipeline_default(self, collection_name):
        '''
        Default storage routine: insert every result, merged with the task's
        'parent' fields, into the given collection.

        Example of a stored document (demo_info):
        {
            '_id':'http://tieba.baidu.com',
            'name':'百度贴吧'
        }
        :param collection_name: str, name of the target collection
        :return: None
        '''
        if self.task['parent'] is None:
            self.task['parent'] = {}
        if self.task['results'] is not None:
            for result in self.task['results']:
                # Result fields override parent fields on key collisions.
                # copy + update (instead of dict(parent, **result)) also
                # accepts result keys that are not strings.
                insert_data = dict(self.task['parent'])
                insert_data.update(result)
                self.mongoUtil.insert(collection_name=collection_name,
                                      insert_data=insert_data)

    def __del__(self):
        # __init__ may have failed before mongoUtil was assigned; in that
        # case there is no connection to close.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
Exemple #2
0
class InitUtil:
    '''
    Initialisation helper: clears all stored data so that a new round of
    tasks can start from scratch.
    '''
    def __init__(self):
        self.mongoUtil = MongoUtil()

    def init(self):
        '''
        Clear the database named by Setting.MONGO_DB through
        MongoUtil.clear_all().
        :return: None
        '''
        self.mongoUtil.clear_all(Setting.MONGO_DB)

    def __del__(self):
        # MongoUtil() may have raised inside __init__, leaving no mongoUtil
        # attribute; only close a connection that actually exists.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
class TaskUtil:
    '''
    Helper for the 'tasks' collection.

    Shape of a task document:
    {
        "parser": "phase"
        "request": "http://so.eduyun.cn/synResource",
        "response": "<html>...</html>"
        "parent": {},
        "state": "done",
        "uptime":
    }
    '''
    def __init__(self, request=None):
        '''
        Open the Mongo helper and, if the 'tasks' collection is empty, seed
        it with the content of ../conf/task.json (relative to this file).

        :param request: unused; kept (now optional) for backward
                        compatibility with callers that pass it
        '''
        self.mongoUtil = MongoUtil()
        count = self.mongoUtil.count(collection_name='tasks')
        if count == 0:
            task_file = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), '..', 'conf',
                'task.json')
            # The context manager closes the file even if reading or
            # parsing raises.
            with open(task_file, encoding='utf-8') as f:
                insert_data = json.loads(f.read())
            self.mongoUtil.insert(collection_name='tasks',
                                  insert_data=insert_data)

    def get_ready(self):
        '''
        Fetch one pending task and switch it to the 'doing' state.
        :return: whatever MongoUtil.find_one_and_update() returns
                 (presumably a single task dict - confirm against MongoUtil)
        '''
        # Filter: the state field is missing, or state == 'ready'.
        filter_dict = {
            '$or': [{
                'state': {
                    '$exists': False
                }
            }, {
                'state': 'ready'
            }]
        }
        # Update: set state to 'doing'.
        update_dict = {'$set': {'state': 'doing'}}
        return self.mongoUtil.find_one_and_update(collection_name='tasks',
                                                  filter_dict=filter_dict,
                                                  update_dict=update_dict)

    def set_state(self, id, state):
        '''
        Set the state of a task (ready, doing, done).
        :param id: str, primary key (_id) of the task
        :param state: str, new state value
        :return: None
        '''
        self.mongoUtil.update(collection_name='tasks',
                              filter_dict={'_id': id},
                              update_dict={'$set': {'state': state}})

    def replace_one(self, id, task):
        '''
        Replace the whole task document.
        :param id: str, primary key (_id) of the task to replace
        :param task: dict, the replacement document
        :return: the result of MongoUtil.find_one_and_replace()
        '''
        return self.mongoUtil.find_one_and_replace(collection_name='tasks',
                                                   filter_dict={'_id': id},
                                                   replace_dict=task)

    def insert_one(self, task):
        '''
        Insert a task unless an identical one is already stored.

        The task dict itself is used as the lookup filter; when nothing
        matches, its 'state' is set to 'ready' (the caller's dict is
        modified) and it is inserted.
        :param task: dict, the task to enqueue
        :return: None
        '''
        existing = self.mongoUtil.find_one(collection_name='tasks',
                                           filter_dict=task)
        if existing is None:
            task['state'] = 'ready'
            self.mongoUtil.insert(collection_name='tasks', insert_data=task)

    def __del__(self):
        # MongoUtil() may have raised inside __init__, leaving no mongoUtil
        # attribute; only close a connection that actually exists.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
Exemple #4
0
class TaskUtil:
    '''
    Helper for the 'tasks' collection.

    Shape of a task document:
    {
        "parser": "phase"
        "request": "http://so.eduyun.cn/synResource",
        "response": "<html>...</html>"
        "parent": {},
        "state": "done",
        "uptime":
    }
    '''
    def __init__(self):
        '''
        Open the Mongo helper and, if the 'tasks' collection is empty, seed
        it with the first task described by Setting.FIRST_TASK_PARSER,
        Setting.FIRST_TASK_URL and Setting.FIRST_TASK_TABLE.
        '''
        self.mongoUtil = MongoUtil()
        count = self.mongoUtil.count(collection_name='tasks')
        if count == 0:
            insert_data = {
                "parser": Setting.FIRST_TASK_PARSER,
                "request": Setting.FIRST_TASK_URL,
                "table": Setting.FIRST_TASK_TABLE,
                "parent": {},
                "state": "ready"
            }
            self.mongoUtil.insert(collection_name='tasks',
                                  insert_data=insert_data)

    def get_ready(self):
        '''
        Fetch one pending task and switch it to the 'doing' state.
        :return: whatever MongoUtil.find_one_and_update() returns
                 (presumably a single task dict - confirm against MongoUtil)
        '''
        # Filter: the state field is missing, or state == 'ready'.
        filter_dict = {
            '$or': [{
                'state': {
                    '$exists': False
                }
            }, {
                'state': 'ready'
            }]
        }
        # Update: set state to 'doing'.
        update_dict = {'$set': {'state': 'doing'}}
        return self.mongoUtil.find_one_and_update(collection_name='tasks',
                                                  filter_dict=filter_dict,
                                                  update_dict=update_dict)

    def set_state(self, id, state):
        '''
        Set the state of a task (ready, doing, done).
        :param id: str, primary key (_id) of the task
        :param state: str, new state value
        :return: None
        '''
        self.mongoUtil.update(collection_name='tasks',
                              filter_dict={'_id': id},
                              update_dict={'$set': {'state': state}})

    def replace_one(self, id, task):
        '''
        Replace the whole task document.
        :param id: str, primary key (_id) of the task to replace
        :param task: dict, the replacement document
        :return: the result of MongoUtil.find_one_and_replace()
        '''
        return self.mongoUtil.find_one_and_replace(collection_name='tasks',
                                                   filter_dict={'_id': id},
                                                   replace_dict=task)

    def insert_one(self, task):
        '''
        Insert a task unless an identical one is already stored.

        The task dict itself is used as the lookup filter; when nothing
        matches, its 'state' is set to 'ready' (the caller's dict is
        modified) and it is inserted.
        :param task: dict, the task to enqueue
        :return: None
        '''
        existing = self.mongoUtil.find_one(collection_name='tasks',
                                           filter_dict=task)
        if existing is None:
            task['state'] = 'ready'
            self.mongoUtil.insert(collection_name='tasks', insert_data=task)

    def __del__(self):
        # MongoUtil() may have raised inside __init__, leaving no mongoUtil
        # attribute; only close a connection that actually exists.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()