Exemple #1
0
 def count_jobs_by_account_id(cls, account_id, status='running'):
     """Count the jobs that belong to one account.

     :param account_id: value compared against ``Job.account``.
     :param status: when truthy, only jobs whose ``Job.status`` equals it
         are counted; a falsy value (``None``, ``''``) counts every job of
         the account.
     :return: number of matching rows.
     """
     account_jobs = db_session.query(Job).filter(Job.account == account_id)
     if not status:
         return account_jobs.count()
     return account_jobs.filter(Job.status == status).count()
Exemple #2
0
    def select_by_parameter(cls, parameter):
        """Return one page of ``MainUrl`` rows filtered by sort, status and keyword.

        :param parameter: mapping providing ``page``, ``limit``, ``status``,
            ``keyword`` and ``sort``. The numeric entries are converted with
            ``int()`` and the keyword with ``str()`` before any query runs,
            so a missing key or non-numeric value raises from here.
        :return: dict with ``code``, ``message``, ``data`` (the
            ``single_to_dict()`` form of each row on the requested page) and
            ``count`` (number of matching rows ignoring pagination). On one
            of the handled database errors the dict carries code ``"404"``
            with empty data.
        """
        page = int(parameter['page'])
        limit = int(parameter['limit'])
        status = int(parameter['status'])
        keyword = str(parameter['keyword'])
        sort = int(parameter['sort'])
        try:
            matching = db_session.query(MainUrl).filter(
                MainUrl.sort == sort, MainUrl.status == status,
                MainUrl.webSite.like("%{}%".format(keyword)))
            # Fetch and serialise the page while the session is still open.
            # Iterating a lazy query after close() would silently reopen the
            # session and leave it open.
            rows = matching.limit(limit).offset((page - 1) * limit).all()
            data = [item.single_to_dict() for item in rows]
            count = matching.count()

            return {
                "code": "200",
                "message": "succeed",
                "data": data,
                "count": count
            }

        except (SqlalchemyIntegrityError, PymysqlIntegrityError,
                InvalidRequestError):
            return {"code": "404", "message": "fialed", "data": [], "count": 0}
        finally:
            # Runs on success, on handled errors and on unexpected ones, so
            # the session is always released.
            db_session.close()
Exemple #3
0
    def update_task_name(cls, parameter):
        """Attach ``MainUrl`` rows to a task, or detach them from it.

        :param parameter: mapping providing ``task_id`` (converted with
            ``int()``), ``main_url_pids`` (a string holding a Python literal
            sequence of pids, or ``""`` for nothing to do) and ``operation``.
            ``"import"`` sets ``spider_name`` of each listed row to the task
            id, ``"remove"`` resets it to ``0``; any other operation changes
            no row but still commits and reports success.
        :return: dict with ``code`` and ``message``: ``"200"`` on success,
            ``"404"`` on one of the handled database errors, ``"202"`` when
            ``main_url_pids`` is the empty string.
        """
        import ast

        spider_name = int(parameter['task_id'])
        main_url_pids = parameter['main_url_pids']
        operation = str(parameter['operation'])
        if main_url_pids == "":
            return {"code": "202", "message": "并没有移除数据"}

        if operation == "import":
            new_spider_name = spider_name
        elif operation == "remove":
            new_spider_name = 0
        else:
            new_spider_name = None

        try:
            if new_spider_name is not None:
                # SECURITY: this value comes from the caller-supplied
                # parameter mapping. It used to be passed to eval(), which
                # executes arbitrary code; literal_eval only accepts plain
                # literals such as "[1, 2, 3]".
                for main_url_pid in ast.literal_eval(main_url_pids):
                    main_url = db_session.query(MainUrl).filter(
                        MainUrl.pid == main_url_pid).first()
                    if main_url is None:
                        # Unknown pid: nothing to update for it.
                        continue
                    main_url.spider_name = new_spider_name
            db_session.commit()
            return {"code": "200", "message": "更新成功"}

        except (SqlalchemyIntegrityError, PymysqlIntegrityError,
                InvalidRequestError):
            db_session.rollback()
            return {"code": "404", "message": "更新失败"}
        finally:
            # Always release the session, including on unexpected errors.
            db_session.close()
Exemple #4
0
    def select_all(cls, parameter):
        """Return one page of ``MainUrl`` rows with ``spider_name == 0`` and ``status == 1``.

        :param parameter: mapping providing ``page``, ``limit`` and ``sort``,
            each converted with ``int()`` before any query runs, so a missing
            key or non-numeric value raises from here.
        :return: dict with ``code``, ``message``, ``data`` (the
            ``single_to_dict()`` form of each row on the requested page) and
            ``count`` (number of matching rows ignoring pagination). On one
            of the handled database errors the dict carries code ``"404"``
            with empty data.
        """
        page = int(parameter['page'])
        limit = int(parameter['limit'])
        sort = int(parameter['sort'])

        try:
            matching = db_session.query(MainUrl).filter(
                MainUrl.sort == sort, MainUrl.spider_name == 0,
                MainUrl.status == 1)
            # Fetch and serialise the page while the session is still open.
            # Iterating a lazy query after close() would silently reopen the
            # session and leave it open.
            rows = matching.limit(limit).offset((page - 1) * limit).all()
            data = [item.single_to_dict() for item in rows]
            count = matching.count()

            return {
                "code": "200",
                "message": "succeed",
                "data": data,
                "count": count
            }

        except (SqlalchemyIntegrityError, PymysqlIntegrityError,
                InvalidRequestError):
            return {"code": "404", "message": "fialed", "data": [], "count": 0}
        finally:
            # Runs on success, on handled errors and on unexpected ones, so
            # the session is always released.
            db_session.close()
Exemple #5
0
 def get_all_need_restart_task(cls):
     """Fetch the tasks that have to be started again.

     Mainly used when the service comes back up after a server crash:
     returns every task that needs to be started, including those in
     pending and running status.

     :return: list of ``(Task.id, Task.status)`` rows whose status is not
         ``'succeed'``, ``'failed'`` or ``'cancelled'``.
     """
     finished_states = ('succeed', 'failed', 'cancelled')
     still_open = Task.status.notin_(finished_states)
     return db_session.query(Task.id, Task.status).filter(still_open).all()
Exemple #6
0
    def set_job_by_track_ids(cls, track_ids, values):
        """Apply status/result updates to every job whose track id is listed.

        :param track_ids: collection of track ids to look up. It is modified
            in place: every id that has a matching job is removed from it.
        :param values: mapping of track id to a dict that may carry
            ``status``, ``result`` and ``traceback`` for that job.
        :return: on success, ``track_ids`` itself, now holding only the ids
            for which no job was found. If anything fails, the transaction
            is rolled back and a copy of the ids as originally passed is
            returned instead.
        """
        jobs = db_session.query(Job).filter(Job.track_id.in_(track_ids)).all()
        track_ids_copy = track_ids.copy()
        try:
            for job in jobs:
                # Several jobs may share one track id; removing the id only
                # while it is still present keeps a duplicate from raising
                # and rolling back the whole batch.
                if job.track_id in track_ids:
                    track_ids.remove(job.track_id)
                value = values.get(job.track_id, {})
                new_status = value.get('status')
                new_result = value.get('result', '')
                new_traceback = value.get('traceback', '')
                # NOTE(review): a job without an entry in ``values`` gets
                # new_status None and is therefore updated to status None --
                # confirm this is intended.
                if job.status == new_status:
                    continue

                # The first transition to running marks the start time.
                if new_status == 'running':
                    job.start_time = datetime.datetime.now()
                if new_status in ('succeed', 'failed'):
                    job.end_time = datetime.datetime.now()

                job.result = new_result
                job.traceback = new_traceback
                job.status = new_status
            db_session.commit()
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # are not swallowed.
            logger.exception('set_job_by_track_ids catch exception.')
            db_session.rollback()
            return track_ids_copy
        return track_ids
Exemple #7
0
 def get_all_need_check_task(cls, last_time):
     """Fetch all tasks that need to be checked.

     These are the tasks whose status may have been modified by the user.

     :param last_time: lower bound (inclusive) for ``Task.last_update``.
     :return: list of ``(Task.id, Task.status, Task.last_update)`` rows whose
         status is ``'pausing'``, ``'running'`` or ``'cancelled'``.
     """
     watched_states = ('pausing', 'running', 'cancelled')
     criteria = and_(Task.status.in_(watched_states),
                     Task.last_update >= last_time)
     columns = db_session.query(Task.id, Task.status, Task.last_update)
     return columns.filter(criteria).all()
Exemple #8
0
    def set_job_result(cls, job_id, result):
        """Store ``result`` on the job with the given id and commit.

        :param job_id: value compared against ``Job.id``.
        :param result: value assigned to ``job.result``.
        :return: ``True`` when the job was found and updated, else ``False``.
        """
        job = db_session.query(Job).filter(Job.id == job_id).first()
        if not job:
            return False

        job.result = result
        db_session.commit()
        return True
Exemple #9
0
    def set_aps_status(cls, aps_id, status):
        """Set the status of the first ``TaskAccountGroup`` with this aps id and commit.

        :param aps_id: value compared against ``TaskAccountGroup.aps_id``.
        :param status: value assigned to the row's ``status``.
        :return: ``True`` when a row was found and updated, else ``False``.
        """
        by_aps_id = db_session.query(TaskAccountGroup).filter(
            TaskAccountGroup.aps_id == aps_id)
        tag = by_aps_id.first()
        if not tag:
            return False

        tag.status = status
        db_session.commit()
        return True
Exemple #10
0
    def set_task_result(cls, task_id, result):
        """Store ``result`` on a task, stamp ``last_update`` and commit.

        :param task_id: value compared against ``Task.id``.
        :param result: value assigned to ``task.result``.
        :return: ``True`` when the task was found and updated, else ``False``.
        """
        task = db_session.query(Task).filter(Task.id == task_id).first()
        if not task:
            return False

        task.result = result
        # last_update records the local time of this write.
        task.last_update = datetime.datetime.now()
        db_session.commit()
        return True
Exemple #11
0
    def start_task(cls, parameter):
        """Build one crawl-parameter dict per ``MainUrl`` row of a spider.

        Rows are selected by ``MainUrl.spider_name == int(parameter['id'])``
        and ``MainUrl.status == 1`` and converted with ``single_to_dict()``.

        :param parameter: mapping providing ``id``.
        :return: list of dicts with the keys ``rule``, ``pid``, ``webSite``
            and ``url``. ``rule`` falls back to a default rule dict when the
            row has no rule (``None``, ``"null"``, ``""``) or its rule cannot
            be processed.
        """
        def default_rule():
            # Built per call so that no two entries share the same dict.
            return {
                'filter_rule': '',
                'selector': 'xpath',
                'deep_limit': '1',
                'fields': {
                    'title': '',
                    'author': '',
                    'publishTime': '',
                    'content': ''
                }
            }

        def log_default(url):
            crawler_info.info(
                "{} : has no filtering rules, default algorithm acquisition"
                .format(url))

        spider_name = int(parameter['id'])
        datas = db_session.query(MainUrl).filter(
            MainUrl.spider_name == spider_name, MainUrl.status == 1).all()
        db_session.close()

        parameters = []
        for item in [row.single_to_dict() for row in datas]:
            # Named ``entry`` so the ``parameter`` argument is not shadowed.
            entry = {}
            url = item.get("address")
            try:
                rule = item["rule"]
                if rule is None or rule in ("null", ""):
                    log_default(url)
                    entry["rule"] = default_rule()
                elif json.loads(rule)["filter_rule"]:
                    # "@" in the stored rule is turned back into "+" before
                    # the rule is parsed for use.
                    entry["rule"] = json.loads(rule.replace("@", "+"))
                else:
                    # NOTE(review): this branch passes the raw JSON string
                    # on, while every other branch yields a dict -- confirm
                    # that consumers expect this.
                    entry["rule"] = rule
            except Exception:
                # Best effort: any problem with the stored rule (missing
                # key, invalid JSON, unexpected type) falls back to the
                # default rule instead of aborting the whole task.
                log_default(url)
                entry["rule"] = default_rule()

            entry["pid"] = item.get("pid")
            entry["webSite"] = item.get("webSite")
            entry["url"] = str(url).strip()
            parameters.append(entry)
        return parameters
Exemple #12
0
    def get_account_tasks(cls, account_id):
        """Look up every task associated with an account.

        :param account_id: value compared against
            ``TaskAccountGroup.account_id``.
        :return: list with the ``task_id`` of every associated row.
        """
        groups = db_session.query(TaskAccountGroup).filter(
            TaskAccountGroup.account_id == account_id).all()
        return [group.task_id for group in groups]
Exemple #13
0
    def set_job_status(cls, job_id, status):
        """Change the status of a job and keep its timestamps in step.

        Nothing is written when the job already has the requested status.

        :param job_id: value compared against ``Job.id``.
        :param status: new status. ``'running'`` stamps ``start_time``; a
            terminal status stamps ``end_time``.
        :return: ``True`` when the job exists (changed or not), else
            ``False``.
        """
        job = db_session.query(Job).filter(Job.id == job_id).first()
        if not job:
            return False

        if job.status != status:
            # The first transition to running marks the start time.
            if status == 'running':
                job.start_time = datetime.datetime.now()
            # The sibling methods in this file name the terminal states
            # 'succeed'/'failed'; the 'success'/'failure' spellings checked
            # here before are kept for existing callers.
            if status in ('succeed', 'failed', 'success', 'failure'):
                job.end_time = datetime.datetime.now()

            job.status = status
            db_session.commit()
        return True
Exemple #14
0
    def set_job_by_track_id(cls, track_id, status, result='', traceback=''):
        """Write status, result and traceback to the job with this track id.

        ``result``, ``traceback`` and ``status`` are always written and
        committed; the timestamps are only touched when the status changes.

        :param track_id: value compared against ``Job.track_id``.
        :param status: new status of the job.
        :param result: value stored in ``job.result``.
        :param traceback: value stored in ``job.traceback``.
        :return: ``True`` when the job was found, else ``False``.
        """
        job = db_session.query(Job).filter(Job.track_id == track_id).first()
        if not job:
            return False

        status_changed = job.status != status
        # The first transition to running marks the start time.
        if status_changed and status == 'running':
            job.start_time = datetime.datetime.now()
        if status_changed and status in ['succeed', 'failed']:
            job.end_time = datetime.datetime.now()

        job.result = result
        job.traceback = traceback
        job.status = status
        db_session.commit()
        return True
Exemple #15
0
 def update_mainurl(cls, parameter):
     """Update ``remark``, ``status`` and/or ``rule`` of one ``MainUrl`` row.

     Only the fields present in ``parameter`` are written. All of them are
     applied to the row first and committed together, so a field is not lost
     because an earlier field already closed the session.

     :param parameter: mapping providing ``pid`` (required) and any of
         ``remark``, ``status``, ``rule``.
     :return: ``None``. A missing row or a failed commit does not raise; the
         commit failure is rolled back and logged.
     """
     import logging

     pid = parameter['pid']
     try:
         mainurl = db_session.query(MainUrl).filter(MainUrl.pid == pid).first()
         if mainurl is None:
             return

         changed = False
         for field in ('remark', 'status', 'rule'):
             try:
                 value = parameter[field]
             except KeyError:
                 # Field not supplied: leave the stored value alone.
                 continue
             setattr(mainurl, field, value)
             changed = True

         if changed:
             try:
                 db_session.commit()
             except Exception:
                 # Stay best-effort like before, but leave a trace instead
                 # of silently discarding the failure.
                 db_session.rollback()
                 logging.getLogger(__name__).exception(
                     'update_mainurl failed for pid %s', pid)
     finally:
         db_session.close()
Exemple #16
0
 def get_all_accounts(cls):
     """Return every ``Account`` row as a list."""
     accounts = db_session.query(Account)
     return accounts.all()
Exemple #17
0
 def get_jobs_by_task_id(cls, task_id):
     """Return the status of every job of a task.

     :param task_id: value compared against ``Job.task``.
     :return: list of single-column ``(Job.status,)`` rows.
     """
     belongs_to_task = Job.task == task_id
     statuses = db_session.query(Job.status).filter(belongs_to_task)
     return statuses.all()
Exemple #18
0
 def add_account_using_counts(cls, account_id):
     """Increase the ``using`` counter of an account by one.

     The change is only made on the loaded object; this method does not
     commit. An unknown account id is ignored.

     :param account_id: value compared against ``Account.id``.
     """
     account = db_session.query(Account).filter(
         Account.id == account_id).first()
     if not account:
         return
     account.using += 1
Exemple #19
0
 def get_all_pausing_task(cls):
     """Return every ``Task`` row whose status is ``'pausing'``."""
     is_pausing = Task.status == 'pausing'
     return db_session.query(Task).filter(is_pausing).all()
Exemple #20
0
 def get_all_processor(cls):
     """Return the distinct ``TaskCategory.processor`` values as a plain list."""
     rows = db_session.query(TaskCategory.processor).distinct().all()
     # Each row carries exactly the one selected column.
     return [row[0] for row in rows]
Exemple #21
0
 def get_aps_ids_by_task_id(cls, task_id):
     """Return the ``aps_id`` of a task.

     :param task_id: value compared against ``Task.id``.
     :return: the task's ``aps_id``, or ``''`` when no such task exists.
     """
     row = db_session.query(Task.aps_id).filter(Task.id == task_id).first()
     return row[0] if row else ''
Exemple #22
0
 def get_task_status_apsid(cls, task_id):
     """Return ``(status, aps_id)`` of a task.

     :param task_id: value compared against ``Task.id``.
     :return: the first matching ``(Task.status, Task.aps_id)`` row, or
         ``None`` when no task has this id.
     """
     columns = db_session.query(Task.status, Task.aps_id)
     return columns.filter(Task.id == task_id).first()
Exemple #23
0
 def get_task_by_task_id(cls, task_id):
     """Return the ``Task`` with the given id.

     :param task_id: value compared against ``Task.id``.
     :return: the first matching ``Task``, or ``None`` when there is none.
     """
     by_id = db_session.query(Task).filter(Task.id == task_id)
     return by_id.first()
Exemple #24
0
 def get_all_failed_task(cls):
     """Return every ``Task`` row whose status is ``'failed'``.

     The query now names ``Task`` as the entity to select; an entity-less
     ``query()`` has nothing to select and cannot be executed.
     """
     return db_session.query(Task).filter(Task.status == 'failed').all()
Exemple #25
0
 def get_all_tasks(cls):
     """Return every ``Task`` row as a list."""
     tasks = db_session.query(Task)
     return tasks.all()
Exemple #26
0
 def get_account(cls, account_id):
     """Return the ``Account`` with the given id.

     :param account_id: value compared against ``Account.id``.
     :return: the first matching ``Account``, or ``None`` when there is none.
     """
     by_id = db_session.query(Account).filter(Account.id == account_id)
     return by_id.first()
Exemple #27
0
 def delete_one(cls, parameter):
     """Delete the ``MainUrl`` row whose pid is ``parameter["pid"]``.

     A pid that matches no row is ignored. Errors raised by the query or
     the commit still propagate to the caller, but the session is closed
     in every case.

     :param parameter: mapping providing ``pid``.
     """
     try:
         maininfo = db_session.query(MainUrl).filter(
             MainUrl.pid == parameter["pid"]).first()
         # first() yields None for an unknown pid, and the session cannot
         # delete None.
         if maininfo is not None:
             db_session.delete(maininfo)
             db_session.commit()
     finally:
         db_session.close()
Exemple #28
0
 def set_aps_status_by_task(cls, task_id, status):
     """Set ``status`` on every ``TaskAccountGroup`` row of a task.

     The rows are only modified in the session; this method does not commit.

     :param task_id: value compared against ``TaskAccountGroup.task_id``.
     :param status: value assigned to each row's ``status``.
     """
     of_task = db_session.query(TaskAccountGroup).filter(
         TaskAccountGroup.task_id == task_id)
     for group in of_task.all():
         group.status = status
Exemple #29
0
 def get_scheduler(cls, scheduler_id):
     """Return the timing settings of a scheduler.

     :param scheduler_id: value compared against ``Scheduler.id``.
     :return: the first matching ``(mode, interval, start_date, end_date)``
         row, or ``None`` when no scheduler has this id.
     """
     settings = db_session.query(
         Scheduler.mode,
         Scheduler.interval,
         Scheduler.start_date,
         Scheduler.end_date,
     )
     return settings.filter(Scheduler.id == scheduler_id).first()
Exemple #30
0
 def get_all_new_task(cls):
     """Return ``(Task.id, Task.status)`` rows for every task with status ``'new'``."""
     is_new = Task.status == 'new'
     return db_session.query(Task.id, Task.status).filter(is_new).all()