Code Example #1
File: views.py  Project: WNCrawl/WNCrawlWeb
def edit_user(request, user_id):
    """
    修改用户
    :param user_id:
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        alert_options = data.get('alert_options')

        user = CrawlUser.objects.get(id=user_id)
        user.account = data.get('account')
        user.mobile = data.get('mobile', '')
        user.wx_account = data.get('wx_account')
        user.comment = data.get('comment', '')
        user.alert_enable = data.get('alert_enable', 0)
        user.save()

        role_ids = data.get('role_ids')

        CrawlUserRoleRel.objects.filter(user_id=user_id).update(is_deleted=1)
        for role_id in role_ids:
            CrawlUserRoleRel.objects.create(role_id=role_id,
                                            user_id=user_id)

        # Write the permission tree to Redis
        user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id)
        crawl_redis.set('permission#user#{}'.format(user_id), json.dumps(build_permission_tree(user_roles)))

        r = Result.success(None)
        return JsonResponse(r)
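
A request body of roughly the following shape is what edit_user reads; every field below is taken from the view, the values are illustrative, and alert_options is read but not otherwise used in this excerpt:

payload = {
    "account": "ops-account",
    "mobile": "13800000000",
    "wx_account": "wx_ops",
    "comment": "",
    "alert_enable": 1,
    "alert_options": [],
    "role_ids": [1, 2]
}
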
Code Example #2
File: script_views.py  Project: WNCrawl/WNCrawlWeb
def edit_script_cfg(request):
    """
    编辑爬虫脚本配置
    :param request: request object
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            spider_name = data['spider_name']
            script_name = data['script_name']
            apply_to_all = data['applyToAll']
            task_id = data['project_id']

            script_args = []
            for p in data.get('params'):
                if isinstance(p['args'], str):
                    p['args'] = json.loads(p['args'])
                script_args.append(p)
                if p.get('trigger'):
                    result, message = scheduler_helper.verify_cron(p.get('trigger'))
                    if not result:
                        raise Exception('参数错误: {}'.format(message))

            update_kwargs = {
                "trigger": data.get('trigger'),
                "hosts": data.get('hosts'),
                "args": json.dumps(script_args)}


            # 批量设置当前任务的所有脚本
            if apply_to_all:
                crawl_scripts = CrawlScript.objects.filter(task_id=task_id)
                crawl_scripts.update(**update_kwargs)
            else:
                crawl_scripts = CrawlScript.objects.get(name=spider_name, task_id=task_id)
                crawl_scripts.trigger = data.get('trigger')
                crawl_scripts.hosts = data.get('hosts')
                crawl_scripts.args = json.dumps(script_args)
                crawl_scripts.save()

            if 'params' in data and data['params']:
                args = data['params']
                # Store each script's run arguments; arguments for different scheduling batches are distinguished by an md5 digest
                for arg in args:
                    if apply_to_all:
                        for script in crawl_scripts:
                            v_arg = encrypt_kit.md5(json.dumps(arg))
                            crawl_redis.set("args#{}#{}".format(script.name, v_arg), json.dumps(arg['args']))
                    else:
                        v_arg = encrypt_kit.md5(json.dumps(arg))
                        crawl_redis.set("args#{}#{}".format(spider_name, v_arg), json.dumps(arg['args']))

            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
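
Each parameter's trigger is validated with scheduler_helper.verify_cron before anything is saved. A minimal sketch of such a check, assuming the trigger is a cron-style keyword dict and that the helper returns a (result, message) pair as the call site suggests; it uses APScheduler's CronTrigger rather than the project's own implementation:

from apscheduler.triggers.cron import CronTrigger

def verify_cron(trigger):
    # Return (True, '') if the dict builds a valid cron trigger, else (False, reason).
    try:
        CronTrigger(**trigger)
        return True, ''
    except (TypeError, ValueError) as exc:
        return False, str(exc)
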
Code Example #3
def get_access_token(appid, secret):
    # Return the cached access token, or fetch a fresh one and cache it for ~2 hours.
    access_token = crawl_redis.get('wx_access_token')
    if access_token is not None:
        return bytes.decode(access_token)
    else:
        http_response = urllib.request.urlopen(GET_ACCESS_TOKEN % {
            'appid': appid,
            'secret': secret
        })
        response = json.loads(http_response.read())
        access_token = response.get('access_token')
        crawl_redis.set('wx_access_token', access_token, 7100)
        return access_token
Code Example #4
def env_get_access_token(corpid, secret):
    access_token = crawl_redis.get('env_wx_access_token')
    if access_token is not None:
        return bytes.decode(access_token)
    else:
        r = requests.get(ENV_GET_ACCESS_TOKEN % {
            'corpid': corpid,
            'secret': secret
        })
        response = r.json()
        access_token = response.get('access_token')
        expires_in = int(response.get('expires_in'))
        crawl_redis.set('env_wx_access_token', access_token, expires_in - 100)
        return access_token
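
Examples #3 and #4 follow the same cache-aside pattern: return the token from Redis if present, otherwise fetch it from the remote API and cache it with a TTL slightly shorter than its real lifetime. A self-contained sketch of that pattern, assuming a local redis.Redis connection, a placeholder token URL, and a 7200-second token lifetime:

import json
import urllib.request

import redis

r = redis.Redis(host='localhost', port=6379, db=1)  # assumed connection
TOKEN_URL = 'https://example.invalid/token?appid=%(appid)s&secret=%(secret)s'  # placeholder

def get_cached_token(appid, secret, lifetime=7200, safety_margin=100):
    cached = r.get('wx_access_token')
    if cached is not None:
        return cached.decode('utf-8')
    with urllib.request.urlopen(TOKEN_URL % {'appid': appid, 'secret': secret}) as resp:
        payload = json.loads(resp.read())
    token = payload['access_token']
    # Expire a little early so a nearly-stale token is never served from the cache.
    r.set('wx_access_token', token, ex=lifetime - safety_margin)
    return token
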
Code Example #5
File: views.py  Project: WNCrawl/WNCrawlWeb
def fetch_user_permissions(request):
    """
    获取用户菜单权限列表
    :param request:
    :return:
    """
    user_id = request.user_id
    user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id, is_deleted=0)
    if not user_roles:
        return JsonResponse(Result.success(data={}))
    permission_tree = build_permission_tree(user_roles)
    crawl_redis.set('permission#user#{}'.format(user_id), json.dumps(permission_tree))
    r = Result.success(data=permission_tree)
    return JsonResponse(r)
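
Examples #1, #5 and #6 all cache the serialized permission tree in Redis under permission#user#<id>. A sketch of the read side of that cache; the key layout mirrors the examples, while the connection, the has_permission helper, and the assumption that the tree is a dict keyed by menu identifiers are illustrative:

import json

import redis

r = redis.Redis(host='localhost', port=6379, db=1)  # assumed connection

def load_permission_tree(user_id):
    # Read back the tree written by the views above; empty dict if not cached yet.
    raw = r.get('permission#user#{}'.format(user_id))
    return json.loads(raw) if raw else {}

def has_permission(user_id, menu_key):
    # Hypothetical check assuming the tree is keyed by menu identifiers.
    return menu_key in load_permission_tree(user_id)
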
Code Example #6
File: views.py  Project: WNCrawl/WNCrawlWeb
def create_user(request):
    """
    创建用户
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            username = data.get('username')

            if CrawlUser.objects.filter(username=username, is_deleted=0):
                raise Exception('账号名存在')

            account = data.get('account')
            mobile = data.get('mobile')
            wx_account = data.get('wx_account')
            role_ids = data.get('role_ids')
            alert_options = data.get('alert_options')
            comment = data.get('comment')
            alert_enable = data.get('alert_enable', 0)
            password = random_password(6)
            user = CrawlUser.objects.create(account=account,
                                            username=username,
                                            mobile=mobile,
                                            comment=comment,
                                            wx_account=wx_account,
                                            password=password2md5(password),
                                            alert_enable=alert_enable)
            user_id = user.id
            for role_id in role_ids:
                CrawlUserRoleRel.objects.create(user_id=user_id,
                                                role_id=role_id)

            # Write the permission tree to Redis
            user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id)
            crawl_redis.set('permission#user#{}'.format(user_id), json.dumps(build_permission_tree(user_roles)))

            response = {'username': username,
                        'password': password}
            r = Result.success(response)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
Code Example #7
File: scheduler.py  Project: WNCrawl/WNCrawlWeb
    def run(self):
        while True:
            try:
                # Re-add the sync tasks that are waiting to be scheduled
                self.scheduler.remove_all_jobs()
                sync_task_models = CrawlSyncTask.objects.filter(is_deleted=0)
                if not sync_task_models:
                    log_common.warn('任务获取失败')
                    continue
                for sync_model in sync_task_models:
                    node_ports = eval(sync_model.execute_host)
                    if not sync_model.source_cfg:
                        continue
                    source_cfg = eval(sync_model.source_cfg)
                    target_cfg = eval(sync_model.target_cfg)

                    args = {
                        "conditions": source_cfg["source_condition"],
                        "path": target_cfg["target_path"],
                    }
                    trigger = sync_model.schedule_date
                    mix = "{}-{}-{}".format(trigger, sync_model.source_cfg,
                                            sync_model.target_cfg)
                    job_id = "{}-{}".format(str(sync_model.id), mix)
                    md5_job = md5(job_id)
                    crawl_redis.set("sync#cfg#{}".format(md5_job),
                                    json.dumps(args))
                    self.scheduler.add_job(work_func,
                                           trigger="cron",
                                           **eval(trigger),
                                           id=md5_job,
                                           args=[
                                               node_ports, "pro_sync_erp",
                                               "erp_sync", md5_job,
                                               sync_model.id
                                           ])
            except Exception as ex:
                import traceback
                log_common.error("调度数据同步任务失败", ex)
                log_common.error("调度数据同步任务失败 = {}".format(
                    traceback.format_exc()))
            finally:
                connections.close_all()
                time.sleep(3 * 60)
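
The loop stores each trigger as a cron-style keyword dict (serialized as a string in schedule_date) and splats it into add_job with trigger='cron'. A minimal standalone sketch of that wiring; a BlockingScheduler, a literal trigger dict, and a placeholder work_func are assumptions used to keep the sketch self-contained:

from apscheduler.schedulers.blocking import BlockingScheduler

def work_func(node_ports, project, spider, batch_id, task_id):
    # Placeholder for the real dispatch logic.
    print('dispatch', project, spider, batch_id)

scheduler = BlockingScheduler()

# Equivalent of eval(trigger) on a stored string such as "{'hour': 2, 'minute': 30}".
trigger = {'hour': 2, 'minute': 30}

scheduler.add_job(work_func,
                  trigger='cron',
                  **trigger,
                  id='demo-job',
                  args=[['10.0.0.1:6800'], 'pro_sync_erp', 'erp_sync', 'demo-batch', 1])

scheduler.start()  # blocks and fires work_func at 02:30 every day
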
Code Example #8
File: script_views.py  Project: WNCrawl/WNCrawlWeb
def script_start(request):
    """
    启动脚本
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data_scripts = json.loads(request.body.decode('utf-8'))

            if not data_scripts:
                return JsonResponse(Result.fail("没有指定脚本"))

            for data_script in data_scripts:
                _job_id = ''
                crawl_script = CrawlScript.objects.get(id=data_script['id'])
                host_list = get_hosts_by_script_id(crawl_script.id)
                for host in host_list:
                    engine = get_engine_by_ip(host)
                    if "args" in data_script and data_script["args"]:
                        for arg in data_script["args"]:
                            if 'dynamic_value' in arg:
                                script_arg = json.loads(arg)
                                sql = script_arg.get('dynamic_value')
                                result = db_kit.fetch_all_to_json(sql)
                                for r in result:
                                    if isinstance(arg, str):
                                        arg = json.loads(arg)
                                    arg['dynamic_value'] = r
                                    batch_id = encrypt_kit.md5(json.dumps(arg))
                                    args = {
                                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                            db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                        "batch_id": batch_id,
                                        "node": host,
                                        "args": arg
                                    }
                                    # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                    log_common.warn('>>>> 动态分割脚本启动 {}'.format(json.dumps(args)))
                                    _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                    crawl_redis.set("args#{}".format(batch_id), json.dumps(arg))
                            else:
                                batch_id = encrypt_kit.md5(json.dumps(arg))
                                args = {
                                    "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                        db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                    "batch_id": batch_id,
                                    "node": host,
                                    "args": arg
                                }
                                # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name,
                                                              **args)
                                crawl_redis.set("args#{}".format(batch_id), arg)
                    else:
                        ta = time.strftime('%Y-%m-%d %H:%M:%S')
                        batch_id = encrypt_kit.md5(ta)
                        args = {
                            "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                            "batch_id": batch_id,
                            "node": host,
                            "args": '{}'
                        }
                        _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                        # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                        crawl_redis.set("args#{}".format(batch_id), json.dumps('{}'))
                crawl_script.job_id = _job_id
                crawl_script.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as err:
        r = Result.fail(err)
        return JsonResponse(r)
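
script_start derives batch_id as the md5 of the serialized arguments and parks the arguments in Redis under args#<batch_id> so the spider can fetch them at run time. A sketch of both sides of that handoff; the hashlib-based digest, the redis.Redis connection, and the spider-side load_batch_args helper are assumptions:

import hashlib
import json

import redis

r = redis.Redis(host='localhost', port=6379, db=1)  # assumed connection

def publish_batch_args(arg):
    # Store one batch of run arguments and return the batch id that identifies it.
    serialized = json.dumps(arg, sort_keys=True)
    batch_id = hashlib.md5(serialized.encode('utf-8')).hexdigest()
    r.set('args#{}'.format(batch_id), serialized)
    return batch_id

def load_batch_args(batch_id):
    # Hypothetical spider-side lookup of the arguments for one batch.
    raw = r.get('args#{}'.format(batch_id))
    return json.loads(raw) if raw else {}
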
Code Example #9
def schedule_fix_data(nodes, project, spider, spider_id, script_args, job_id, fix_type=0):
    """
    调度补数据逻辑
    :return:
    """
    if isinstance(script_args, str):
        script_args = eval(script_args)
    start_date = script_args.get('conditions').get('start_date')
    end_date = script_args.get('conditions').get('end_date')
    date_list = parse_date(start_date, end_date, fix_type)
    # Only the first host is considered
    node = nodes[0]
    engine = get_engine_by_ip(node)

    is_first = True
    index = 0
    last_batch_id = ''

    pub = crawl_redis.pubsub()
    while index < len(date_list):
        pub.subscribe(job_id)
        message = pub.parse_response()
        if is_first or (pub and message[2] != 1 and (message[2]).decode('utf-8') == last_batch_id):
            mix = "{}-{}".format(json.dumps(date_list[index]), json.dumps(script_args))
            batch_id = "fix-{}-{}".format(str(spider_id), md5(mix))
            is_first = False
            last_batch_id = batch_id

            day_type = ''
            if fix_type == 1:
                day_type = 'day'
            elif fix_type == 2:
                day_type = 'week'
            elif fix_type == 3:
                day_type = 'month'

            log_common.warning("project: {}:  spider:{}  batch:{}  trigger: {}".format(project, spider, batch_id, json.dumps(date_list[index])))

            condition = {
                'conditions': {
                    'date_type': day_type,
                    'start_date': date_list[index].get('start_date'),
                    'end_date': date_list[index].get('end_date')
                }
            }

            lock = dlm.lock("dlm#{}".format(batch_id), 1000 * 30)
            if lock:
                index = index + 1
                crawl_redis.set("args#{}".format(batch_id), json.dumps(condition))
                args = {
                    "redis": '{{"host":"{}","port": {},"db":{},"password":"******"}}'.format(db_conf.redis_host,
                                                                                         str(db_conf.redis_port),
                                                                                         str(db_conf.redis_db_name),
                                                                                         db_conf.redis_pwd),
                    "batch_id": batch_id,
                    "node": node,
                    "fix_id": job_id
                }
                jobs = engine.schedule(project, spider, **args)
                script = CrawlScript.objects.get(name=spider, project_name=project)
                script.job_id = jobs
                script.save()
                log_common.warning("补数据任务{}:  {};Jobs:{}".format(project, spider, jobs))
            else:
                log_common.warning("batch:{} locked".format(batch_id))
Code Example #10
    def run(self):
        while True:
            try:
                # Clear all jobs (disabled)
                # self.scheduler.remove_all_jobs()
                log_common.warn('*********** 刷新调度器 **********')
                redis_jobs = self.scheduler.get_jobs()
                redis_job_ids = [rj.id for rj in redis_jobs]
                db_job_ids = []

                script_models = CrawlScript.objects.filter(is_deleted=0, is_disable=0)
                for script_model in script_models:
                    node_list = []
                    if not script_model.hosts or script_model.hosts == '[]':
                        project = CrawlProject.objects.get(id=script_model.project_id)
                        task = CrawlTask.objects.get(id=project.task_id)
                        for node_id in json.loads(task.node_ids):
                            node = CrawlNode.objects.get(id=node_id)
                            node_list.append('{}:{}'.format(node.node_ip, node.node_port))
                    else:
                        node_list = eval(script_model.hosts)
                    json_args = []
                    if script_model.args:
                        json_args = eval(script_model.args)
                    for json_arg in json_args:
                        script_args = json_arg["args"]
                        script_triggers = json_arg["trigger"]
                        fix_type = json_arg["fix_type"]

                        try:
                            if script_triggers:
                                # Backfill (data-fix) logic
                                if fix_type in (1, 2, 3):
                                    run_date = json_arg['fix_date']
                                    mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                    job_id = "fix-{}-{}".format(str(script_model.id), md5(mix))
                                    log_common.warn('添加补数据调度任务: {}'.format(script_model.id))
                                    # Run immediately, for testing
                                    # schedule_fix_data(node_list, script_model.project_name, script_model.name, script_model.id, script_args, job_id, fix_type)

                                    # Normal path
                                    db_job_ids.append(job_id)
                                    if datetime.datetime.strptime(run_date, '%Y-%m-%d %H:%M:%S') >= datetime.datetime.now() and job_id not in redis_job_ids:
                                        self.scheduler.add_job(schedule_fix_data,
                                                               'date',
                                                               run_date=run_date,
                                                               id=job_id,
                                                               args=[node_list, script_model.project_name,
                                                                     script_model.name, script_model.id,
                                                                     script_args, job_id, fix_type],
                                                               misfire_grace_time=60)
                                else:
                                    # Dynamic parameters
                                    if json_arg.get('dynamic_value'):
                                        sql = json_arg.get('dynamic_value')
                                        result = db_kit.fetch_all_to_json(sql)
                                        for r in result:
                                            script_args['dynamic_value'] = r
                                            log_common.warn('>>>> 动态切割参数调度 {}, args: {}'.format(script_model.name, script_args))
                                            mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                            job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                            log_common.warn("args#{}".format(job_id))
                                            crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                            # log_common.warn('添加调度任务: {}'.format(script_model.id))
                                            db_job_ids.append(job_id)
                                            if job_id not in redis_job_ids:
                                                self.scheduler.add_job(work_func,
                                                                       trigger="cron",
                                                                       **script_triggers,
                                                                       id=job_id,
                                                                       args=[node_list, script_model.project_name,
                                                                             script_model.name, job_id],
                                                                       misfire_grace_time=60)
                                    else:
                                        mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                        job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                        crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                        log_common.warn('添加调度任务: {}'.format(script_model.id))
                                        db_job_ids.append(job_id)
                                        if job_id not in redis_job_ids:
                                            self.scheduler.add_job(work_func,
                                                                   trigger="cron",
                                                                   **script_triggers,
                                                                   id=job_id,
                                                                   args=[node_list, script_model.project_name,
                                                                         script_model.name, job_id],
                                                                   misfire_grace_time=60)
                        except Exception as e:
                            log_common.warn(">>>> 添加报错任务报错: ", e)
                            continue

                c_ids = [i for i in redis_job_ids if i not in db_job_ids]
                for c_id in c_ids:
                    self.scheduler.remove_job(c_id)
                    log_common.warn('移除差异任务: {}'.format(c_id))
                db_job_ids.clear()
            except Exception as ex:
                log_common.warn(ex)
                continue
            finally:
                connections.close_all()
                time.sleep(7 * 60)
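
The refresh loop is a reconciliation pass: job IDs are content hashes of the trigger and arguments, so any configuration change produces a new ID, and IDs present in the scheduler but absent from the database are removed. A condensed sketch of that diff-and-remove logic; the desired mapping and its shape are assumptions for illustration:

from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.start()

def sync_jobs(desired):
    # desired maps job_id -> (func, cron_trigger_dict, args); IDs are content hashes,
    # so a changed configuration appears as a new ID and the stale one is dropped.
    current_ids = {job.id for job in scheduler.get_jobs()}

    for job_id, (func, trigger, args) in desired.items():
        if job_id not in current_ids:
            scheduler.add_job(func, trigger='cron', **trigger, id=job_id,
                              args=args, misfire_grace_time=60)

    # Remove jobs that no longer exist in the database.
    for stale_id in current_ids - set(desired):
        scheduler.remove_job(stale_id)
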