def edit_user(request, user_id):
    """
    Edit a user.
    :param user_id:
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        alert_options = data.get('alert_options')
        user = CrawlUser.objects.get(id=user_id)
        user.account = data.get('account')
        user.mobile = data.get('mobile', '')
        user.wx_account = data.get('wx_account')
        user.comment = data.get('comment', '')
        user.alert_enable = data.get('alert_enable', 0)
        user.save()

        # Soft-delete the old role relations, then recreate them from the request.
        role_ids = data.get('role_ids')
        CrawlUserRoleRel.objects.filter(user_id=user_id).update(is_deleted=1)
        for role_id in role_ids:
            CrawlUserRoleRel.objects.create(role_id=role_id, user_id=user_id)

        # Write the permission tree into redis; only the active (non-deleted) relations count.
        user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id, is_deleted=0)
        crawl_redis.set('permission#user#{}'.format(user_id),
                        json.dumps(build_permission_tree(user_roles)))
        r = Result.success(None)
        return JsonResponse(r)
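# `build_permission_tree` is defined elsewhere in the project; it is only called here and in
# fetch_user_permissions/create_user with the user's CrawlUserRoleRel rows and must return a
# JSON-serializable tree. A minimal sketch of the assumed shape; the CrawlRolePermissionRel /
# CrawlPermission model names and fields below are illustrative assumptions, not the project's
# actual schema.
def build_permission_tree(user_roles):
    role_ids = [rel.role_id for rel in user_roles]
    perm_ids = CrawlRolePermissionRel.objects.filter(
        role_id__in=role_ids, is_deleted=0).values_list('permission_id', flat=True)
    permissions = CrawlPermission.objects.filter(id__in=list(perm_ids))
    nodes = {p.id: {'id': p.id, 'name': p.name, 'children': []} for p in permissions}
    tree = []
    for p in permissions:
        parent = nodes.get(p.parent_id)
        if parent:
            # Attach child menu entries under their parent node.
            parent['children'].append(nodes[p.id])
        else:
            tree.append(nodes[p.id])
    return tree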
def edit_script_cfg(request):
    """
    Edit spider script configuration.
    :param request: request object
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            spider_name = data['spider_name']
            script_name = data['script_name']
            apply_to_all = data['applyToAll']
            task_id = data['project_id']

            # Normalize each param entry and validate its cron trigger up front.
            script_args = []
            for p in data.get('params'):
                if isinstance(p['args'], str):
                    p['args'] = json.loads(p['args'])
                script_args.append(p)
                if p.get('trigger'):
                    result, message = scheduler_helper.verify_cron(p.get('trigger'))
                    if not result:
                        raise Exception('Invalid parameter: {}'.format(message))

            update_kwargs = {
                "trigger": data.get('trigger'),
                "hosts": data.get('hosts'),
                "args": json.dumps(script_args)
            }
            # Apply the configuration to every script of the current task in one go.
            if apply_to_all:
                crawl_scripts = CrawlScript.objects.filter(task_id=task_id)
                crawl_scripts.update(**update_kwargs)
            else:
                crawl_scripts = CrawlScript.objects.get(name=spider_name, task_id=task_id)
                crawl_scripts.trigger = data.get('trigger')
                crawl_scripts.hosts = data.get('hosts')
                crawl_scripts.args = json.dumps(script_args)
                crawl_scripts.save()

            if 'params' in data and data['params']:
                args = data['params']
                # Store the execution args of every spider script; args from different
                # schedule batches are distinguished by the md5 of the param entry.
                for arg in args:
                    if apply_to_all:
                        for script in crawl_scripts:
                            v_arg = encrypt_kit.md5(json.dumps(arg))
                            crawl_redis.set("args#{}#{}".format(script.name, v_arg),
                                            json.dumps(arg['args']))
                    else:
                        v_arg = encrypt_kit.md5(json.dumps(arg))
                        crawl_redis.set("args#{}#{}".format(spider_name, v_arg),
                                        json.dumps(arg['args']))
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
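# For reference, a request body that edit_script_cfg above would accept, reconstructed only
# from the fields the view reads (plus fix_type, which the scheduler below expects in each
# param entry). All values are illustrative.
EXAMPLE_EDIT_SCRIPT_CFG_BODY = {
    "spider_name": "demo_spider",
    "script_name": "demo_spider.py",
    "applyToAll": False,
    "project_id": 1,                      # read by the view as the task id
    "trigger": {"minute": "*/30"},        # cron kwargs stored on the script(s)
    "hosts": ["10.0.0.1:6800"],
    "params": [
        {
            # each entry is one scheduled batch; its md5 keys the args stored in redis
            "args": {"city": "shanghai"},
            "trigger": {"hour": "2"},
            "fix_type": 0                  # 0 = normal cron; 1/2/3 = day/week/month backfill
        }
    ]
}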
def get_access_token(appid, secret):
    # Return the cached WeChat access token if present; otherwise fetch a new one
    # and cache it for slightly less than its expiry.
    access_token = crawl_redis.get('wx_access_token')
    if access_token is not None:
        return bytes.decode(access_token)
    else:
        http_response = urllib.request.urlopen(GET_ACCESS_TOKEN % {
            'appid': appid,
            'secret': secret
        })
        response = json.loads(http_response.read())
        access_token = response.get('access_token')
        crawl_redis.set('wx_access_token', access_token, 7100)
        return access_token
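# GET_ACCESS_TOKEN is a module-level constant defined elsewhere. Given the `%`-style dict
# formatting above, it is presumably the WeChat official-account token endpoint, along these
# lines (illustrative, not the project's actual value):
GET_ACCESS_TOKEN_EXAMPLE = (
    'https://api.weixin.qq.com/cgi-bin/token'
    '?grant_type=client_credential&appid=%(appid)s&secret=%(secret)s'
)
# WeChat access tokens are typically valid for 7200 seconds, hence the 7100-second cache TTL.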
def env_get_access_token(corpid, secret):
    # Same caching pattern for the enterprise WeChat token; the TTL is trimmed by
    # 100 seconds so the cache expires before the token itself does.
    access_token = crawl_redis.get('env_wx_access_token')
    if access_token is not None:
        return bytes.decode(access_token)
    else:
        r = requests.get(ENV_GET_ACCESS_TOKEN % {
            'corpid': corpid,
            'secret': secret
        })
        response = r.json()
        access_token = response.get('access_token')
        expires_in = int(response.get('expires_in'))
        crawl_redis.set('env_wx_access_token', access_token, expires_in - 100)
        return access_token
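# ENV_GET_ACCESS_TOKEN is likewise defined elsewhere; based on the corpid/secret parameters it
# is presumably the WeChat Work (enterprise) gettoken endpoint, for example:
ENV_GET_ACCESS_TOKEN_EXAMPLE = (
    'https://qyapi.weixin.qq.com/cgi-bin/gettoken?corpid=%(corpid)s&corpsecret=%(secret)s'
)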
def fetch_user_permissions(request):
    """
    Fetch the user's menu permission list.
    :param request:
    :return:
    """
    user_id = request.user_id
    user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id, is_deleted=0)
    if not user_roles:
        return JsonResponse(Result.success(data={}))
    permission_tree = build_permission_tree(user_roles)
    # Cache the permission tree in redis so later requests can skip rebuilding it.
    crawl_redis.set('permission#user#{}'.format(user_id), json.dumps(permission_tree))
    r = Result.success(data=permission_tree)
    return JsonResponse(r)
def create_user(request):
    """
    Create a user.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            username = data.get('username')
            if CrawlUser.objects.filter(username=username, is_deleted=0):
                raise Exception('Username already exists')
            account = data.get('account')
            mobile = data.get('mobile')
            wx_account = data.get('wx_account')
            role_ids = data.get('role_ids')
            alert_options = data.get('alert_options')
            comment = data.get('comment')
            alert_enable = data.get('alert_enable', 0)

            # Generate a random initial password and store only its md5 hash.
            password = random_password(6)
            user = CrawlUser.objects.create(account=account,
                                            username=username,
                                            mobile=mobile,
                                            comment=comment,
                                            wx_account=wx_account,
                                            password=password2md5(password),
                                            alert_enable=alert_enable)
            user_id = user.id
            for role_id in role_ids:
                CrawlUserRoleRel.objects.create(user_id=user_id, role_id=role_id)

            # Write the permission tree into redis (serialized as JSON, matching edit_user).
            user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id)
            crawl_redis.set('permission#user#{}'.format(user_id),
                            json.dumps(build_permission_tree(user_roles)))

            # The plain-text password is returned once so it can be handed to the new user.
            response = {'username': username, 'password': password}
            r = Result.success(response)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
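# `random_password` and `password2md5` are helpers defined elsewhere. Minimal sketches of what
# they are assumed to do (a random alphanumeric string and an md5 hex digest); the project's
# actual implementations may differ.
import hashlib
import random
import string


def random_password(length):
    # Random initial password of the requested length.
    return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length))


def password2md5(password):
    # Only the md5 hex digest of the password is persisted.
    return hashlib.md5(password.encode('utf-8')).hexdigest()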
def run(self):
    while True:
        try:
            # Re-register all pending sync jobs from scratch on every pass.
            self.scheduler.remove_all_jobs()
            sync_task_models = CrawlSyncTask.objects.filter(is_deleted=0)
            if not sync_task_models:
                log_common.warn('No sync tasks to schedule')
                continue
            for sync_model in sync_task_models:
                node_ports = eval(sync_model.execute_host)
                if not sync_model.source_cfg:
                    continue
                source_cfg = eval(sync_model.source_cfg)
                target_cfg = eval(sync_model.target_cfg)
                args = {
                    "conditions": source_cfg["source_condition"],
                    "path": target_cfg["target_path"],
                }
                trigger = sync_model.schedule_date
                # The job id hashes the task id together with its trigger/source/target
                # config, so a config change yields a new job.
                mix = "{}-{}-{}".format(trigger, sync_model.source_cfg, sync_model.target_cfg)
                job_id = "{}-{}".format(str(sync_model.id), mix)
                md5_job = md5(job_id)
                crawl_redis.set("sync#cfg#{}".format(md5_job), json.dumps(args))
                self.scheduler.add_job(work_func,
                                       trigger="cron",
                                       **eval(trigger),
                                       id=md5_job,
                                       args=[
                                           node_ports,
                                           "pro_sync_erp",
                                           "erp_sync",
                                           md5_job,
                                           sync_model.id
                                       ])
        except Exception as ex:
            import traceback
            log_common.error("Failed to schedule data sync task: {}".format(ex))
            log_common.error("Failed to schedule data sync task = {}".format(traceback.format_exc()))
        finally:
            connections.close_all()
            time.sleep(3 * 60)
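# `sync_model.schedule_date` is stored as the text of a cron-kwargs dict, because it is passed
# through eval() and unpacked into APScheduler's add_job(trigger="cron", **kwargs) above. An
# illustrative value (the keys must be valid CronTrigger arguments):
EXAMPLE_SCHEDULE_DATE = "{'hour': '1', 'minute': '30'}"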
def script_start(request):
    """
    Start scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data_scripts = json.loads(request.body.decode('utf-8'))
            if not data_scripts:
                return JsonResponse(Result.fail("No script specified"))
            for data_script in data_scripts:
                _job_id = ''
                crawl_script = CrawlScript.objects.get(id=data_script['id'])
                host_list = get_hosts_by_script_id(crawl_script.id)
                for host in host_list:
                    engine = get_engine_by_ip(host)
                    if "args" in data_script and data_script["args"]:
                        for arg in data_script["args"]:
                            if 'dynamic_value' in arg:
                                # Dynamic args: run the configured SQL and schedule one job
                                # per result row, each with its own batch id.
                                script_arg = json.loads(arg) if isinstance(arg, str) else arg
                                sql = script_arg.get('dynamic_value')
                                result = db_kit.fetch_all_to_json(sql)
                                for r in result:
                                    if isinstance(arg, str):
                                        arg = json.loads(arg)
                                    arg['dynamic_value'] = r
                                    batch_id = encrypt_kit.md5(json.dumps(arg))
                                    args = {
                                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                            db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                        "batch_id": batch_id,
                                        "node": host,
                                        "args": arg
                                    }
                                    # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                    log_common.warn('>>>> dynamic split script start {}'.format(json.dumps(args)))
                                    _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                    crawl_redis.set("args#{}".format(batch_id), json.dumps(arg))
                            else:
                                batch_id = encrypt_kit.md5(json.dumps(arg))
                                args = {
                                    "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                        db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                    "batch_id": batch_id,
                                    "node": host,
                                    "args": arg
                                }
                                # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                crawl_redis.set("args#{}".format(batch_id), json.dumps(arg))
                    else:
                        # No explicit args: use the start timestamp as the batch id and pass
                        # an empty argument object.
                        ta = time.strftime('%Y-%m-%d %H:%M:%S')
                        batch_id = encrypt_kit.md5(ta)
                        args = {
                            "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                            "batch_id": batch_id,
                            "node": host,
                            "args": '{}'
                        }
                        _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                        # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                        crawl_redis.set("args#{}".format(batch_id), json.dumps('{}'))
                crawl_script.job_id = _job_id
                crawl_script.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as err:
        r = Result.fail(err)
        return JsonResponse(r)
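# On the worker side a spider is assumed to read its runtime arguments back from redis using
# the batch_id passed through `engine_kit.schedule` above. A minimal sketch of that consumer;
# the function name and the way real spiders consume the key are assumptions.
def load_batch_args(redis_client, batch_id):
    # The launcher stores json.dumps(arg) under "args#<batch_id>".
    raw = redis_client.get("args#{}".format(batch_id))
    return json.loads(raw) if raw else {}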
def schedule_fix_data(nodes, project, spider, spider_id, script_args, job_id, fix_type=0):
    """
    Schedule the backfill (data-fixing) logic.
    :return:
    """
    if isinstance(script_args, str):
        script_args = eval(script_args)
    start_date = script_args.get('conditions').get('start_date')
    end_date = script_args.get('conditions').get('end_date')
    date_list = parse_date(start_date, end_date, fix_type)

    # Only the first host is considered.
    node = nodes[0]
    engine = get_engine_by_ip(node)

    is_first = True
    index = 0
    last_batch_id = ''
    pub = crawl_redis.pubsub()
    while index < len(date_list):
        # Wait for the spider to publish the batch id of the finished run before
        # scheduling the next date range.
        pub.subscribe(job_id)
        message = pub.parse_response()
        if is_first or (pub and message[2] != 1 and (message[2]).decode('utf-8') == last_batch_id):
            mix = "{}-{}".format(json.dumps(date_list[index]), json.dumps(script_args))
            batch_id = "fix-{}-{}".format(str(spider_id), md5(mix))
            is_first = False
            last_batch_id = batch_id
            day_type = ''
            if fix_type == 1:
                day_type = 'day'
            elif fix_type == 2:
                day_type = 'week'
            elif fix_type == 3:
                day_type = 'month'
            log_common.warning("project: {}: spider:{} batch:{} trigger: {}".format(
                project, spider, batch_id, json.dumps(date_list[index])))
            condition = {
                'conditions': {
                    'date_type': day_type,
                    'start_date': date_list[index].get('start_date'),
                    'end_date': date_list[index].get('end_date')
                }
            }
            # A distributed lock keeps concurrent schedulers from dispatching the same batch.
            lock = dlm.lock("dlm#{}".format(batch_id), 1000 * 30)
            if lock:
                index = index + 1
                crawl_redis.set("args#{}".format(batch_id), json.dumps(condition))
                args = {
                    "redis": '{{"host":"{}","port": {},"db":{},"password":"******"}}'.format(
                        db_conf.redis_host, str(db_conf.redis_port),
                        str(db_conf.redis_db_name), db_conf.redis_pwd),
                    "batch_id": batch_id,
                    "node": node,
                    "fix_id": job_id
                }
                jobs = engine.schedule(project, spider, **args)
                script = CrawlScript.objects.get(name=spider, project_name=project)
                script.job_id = jobs
                script.save()
                log_common.warning("Backfill task {}: {}; Jobs: {}".format(project, spider, jobs))
            else:
                log_common.warning("batch:{} locked".format(batch_id))
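# `parse_date` is defined elsewhere; from its usage above it is assumed to split the
# [start_date, end_date] range into consecutive segments (by day, week or month depending on
# fix_type) and return a list of {'start_date', 'end_date'} dicts. A simplified sketch under
# those assumptions, using '%Y-%m-%d' formatted dates:
def parse_date_sketch(start_date, end_date, fix_type):
    import datetime
    fmt = '%Y-%m-%d'
    step = {1: 1, 2: 7, 3: 30}.get(fix_type, 1)   # rough day/week/month step in days
    start = datetime.datetime.strptime(start_date, fmt)
    end = datetime.datetime.strptime(end_date, fmt)
    segments = []
    while start <= end:
        seg_end = min(start + datetime.timedelta(days=step - 1), end)
        segments.append({'start_date': start.strftime(fmt), 'end_date': seg_end.strftime(fmt)})
        start = seg_end + datetime.timedelta(days=1)
    return segments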
def run(self):
    while True:
        try:
            # Clear all jobs
            # self.scheduler.remove_all_jobs()
            log_common.warn('*********** Refreshing scheduler **********')
            redis_jobs = self.scheduler.get_jobs()
            redis_job_ids = [rj.id for rj in redis_jobs]
            db_job_ids = []
            script_models = CrawlScript.objects.filter(is_deleted=0, is_disable=0)
            for script_model in script_models:
                # Resolve target nodes: explicit hosts on the script, otherwise every node
                # configured on the owning task.
                node_list = []
                if not script_model.hosts or script_model.hosts == '[]':
                    project = CrawlProject.objects.get(id=script_model.project_id)
                    task = CrawlTask.objects.get(id=project.task_id)
                    for node_id in json.loads(task.node_ids):
                        node = CrawlNode.objects.get(id=node_id)
                        node_list.append('{}:{}'.format(node.node_ip, node.node_port))
                else:
                    node_list = eval(script_model.hosts)
                json_args = []
                if script_model.args:
                    json_args = eval(script_model.args)
                for json_arg in json_args:
                    script_args = json_arg["args"]
                    script_triggers = json_arg["trigger"]
                    fix_type = json_arg["fix_type"]
                    try:
                        if script_triggers:
                            # Backfill logic: a one-off job at fix_date.
                            if fix_type in (1, 2, 3):
                                run_date = json_arg['fix_date']
                                mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                job_id = "fix-{}-{}".format(str(script_model.id), md5(mix))
                                log_common.warn('Adding backfill job: {}'.format(script_model.id))
                                # Immediate test:
                                # schedule_fix_data(node_list, script_model.project_name, script_model.name, script_model.id, script_args, job_id, fix_type)
                                # Normal flow
                                db_job_ids.append(job_id)
                                if datetime.datetime.strptime(run_date, '%Y-%m-%d %H:%M:%S') >= datetime.datetime.now() and job_id not in redis_job_ids:
                                    self.scheduler.add_job(schedule_fix_data,
                                                           'date',
                                                           run_date=run_date,
                                                           id=job_id,
                                                           args=[node_list, script_model.project_name, script_model.name,
                                                                 script_model.id, script_args, job_id, fix_type],
                                                           misfire_grace_time=60)
                            else:
                                # Dynamic parameters: one cron job per SQL result row.
                                if json_arg.get('dynamic_value'):
                                    sql = json_arg.get('dynamic_value')
                                    result = db_kit.fetch_all_to_json(sql)
                                    for r in result:
                                        script_args['dynamic_value'] = r
                                        log_common.warn('>>>> dynamic split scheduling {}, args: {}'.format(script_model.name, script_args))
                                        mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                        job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                        log_common.warn("args#{}".format(job_id))
                                        crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                        # log_common.warn('Adding schedule job: {}'.format(script_model.id))
                                        db_job_ids.append(job_id)
                                        if job_id not in redis_job_ids:
                                            self.scheduler.add_job(work_func,
                                                                   trigger="cron",
                                                                   **script_triggers,
                                                                   id=job_id,
                                                                   args=[node_list, script_model.project_name, script_model.name, job_id],
                                                                   misfire_grace_time=60)
                                else:
                                    mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                    job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                    crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                    log_common.warn('Adding schedule job: {}'.format(script_model.id))
                                    db_job_ids.append(job_id)
                                    if job_id not in redis_job_ids:
                                        self.scheduler.add_job(work_func,
                                                               trigger="cron",
                                                               **script_triggers,
                                                               id=job_id,
                                                               args=[node_list, script_model.project_name, script_model.name, job_id],
                                                               misfire_grace_time=60)
                    except Exception as e:
                        log_common.warn(">>>> Failed to add job: {}".format(e))
                        continue
            # Remove jobs that still exist in the scheduler but are no longer in the database.
            c_ids = [i for i in redis_job_ids if i not in db_job_ids]
            for c_id in c_ids:
                self.scheduler.remove_job(c_id)
                log_common.warn('Removing stale job: {}'.format(c_id))
            db_job_ids.clear()
        except Exception as ex:
            log_common.warn(ex)
            continue
        finally:
            connections.close_all()
            time.sleep(7 * 60)
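# How this refresher is assumed to be wired up: run() above reads like the loop of a daemon
# thread wrapping an APScheduler instance. A minimal, illustrative harness; the class name and
# scheduler setup are assumptions, not the project's actual bootstrap code.
import threading

from apscheduler.schedulers.background import BackgroundScheduler


class ScriptScheduleRefresher(threading.Thread):
    def __init__(self):
        super().__init__(daemon=True)
        self.scheduler = BackgroundScheduler()
        self.scheduler.start()

    # run() is the loop shown above: every 7 minutes it diffs the jobs configured in the
    # database against the jobs currently registered in the scheduler.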