def project_deploy(request, project_name):
    """
    Deploy a crawler project to the selected nodes.
    :param request: request object
    :param project_name: project name
    :return: json of deploy result
    """
    if request.method == 'POST':
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # look up the packaged egg file
        egg = find_egg(project_path)
        if not egg:
            r = Result.fail("No packaged egg file found")
            return JsonResponse(r)
        # read the egg once and push the same content to every node
        with open(join(project_path, egg), 'rb') as egg_file:
            egg_file_content = egg_file.read()
        data = json.loads(request.body.decode('utf-8'))
        node_ids = data["node_ids"]
        nodes = CrawlNode.objects.filter(id__in=node_ids)
        project = CrawlProject.objects.get(name=project_name)
        for node in nodes:
            engine = get_engine(node)
            engine.add_version(project_name, int(time.time()), egg_file_content)
            deployed_at = timezone.now()
            # clear previous deploy records for this node/project pair
            CrawlDeploy.objects.filter(node_id=node.id,
                                       project_id=project.id).delete()
            deploy, result = CrawlDeploy.objects.update_or_create(
                node_id=node.id,
                project_id=project.id,
                deployed_at=deployed_at,
                description=project.description)
        r = Result.success("")
        return JsonResponse(r)

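# A minimal sketch of what the find_egg helper used by the deploy views could
# look like; the real implementation lives elsewhere in the project, so the
# details below (globbing the project directory, picking the newest build) are
# assumptions, and the _sketch suffix marks this as a hypothetical helper.
import glob


def find_egg_sketch(project_path):
    """Return the filename of the most recently built .egg in project_path, or None."""
    eggs = glob.glob(join(project_path, '*.egg'))
    if not eggs:
        return None
    newest = max(eggs, key=os.path.getmtime)
    return os.path.basename(newest)
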
def script_stop(request):
    """
    Stop scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data_scripts = json.loads(request.body.decode('utf-8'))
            if not data_scripts:
                return JsonResponse(Result.fail("No script specified"))
            for data_script in data_scripts:
                crawl_script = CrawlScript.objects.get(id=data_script["id"])
                host_list = get_hosts_by_script_id(crawl_script.id)
                for host in host_list:
                    engine = get_engine_by_ip(host)
                    # redis connection args (currently not used by the cancel call)
                    args = {
                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                            db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                        "batch_id": ''
                    }
                    jobs = engine.cancel(crawl_script.project_name, crawl_script.name)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as err:
        r = Result.fail(err)
        return JsonResponse(r)

def script_newest_log(request):
    """
    Fetch the latest log of a script.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            script_id = data.get('script_id')
            host_ip = data.get('host_ip')
            script = CrawlScript.objects.get(id=script_id)
            project_name = script.project_name
            spider_name = script.name
            job_id = script.job_id
            if not job_id:
                r = Result.success('No log yet')
                return JsonResponse(r)
            url = 'http://{}/logs/{}/{}/{}.log'.format(host_ip, project_name, spider_name, job_id)
            response = requests.get(url)
            if response.status_code != 200:
                r = Result.success('No log yet')
                return JsonResponse(r)
            log_content = response.content.decode('utf-8')
            r = Result.success({'message': log_content})
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def task_by_script_id(request, script_id):
    """
    Fetch the task that a script belongs to.
    :param script_id:
    :param request:
    :return:
    """
    work_path = os.getcwd()
    try:
        if request.method == 'GET':
            script = CrawlScript.objects.get(id=script_id)
            project = CrawlProject.objects.get(id=script.project_id)
            task = CrawlTask.objects.get(id=project.task_id)
            path = os.path.abspath(join(work_path, PROJECTS_FOLDER))
            script_name = script.name
            vo = model_to_dict(task)
            vo['path'] = path
            vo['script_name'] = script.script_file
            r = Result.success(vo)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)

def find_debug_result(request):
    """
    Fetch the result of a debug (test) run.
    :param request:
    :return:
    """
    work_path = os.getcwd()
    try:
        if request.method == 'GET':
            project_name = request.GET.get('project_name')
            spider_name = request.GET.get('spider_name')
            project_path = join(PROJECTS_FOLDER, project_name)
            os.chdir(project_path)
            if not os.path.exists("debug_folder"):
                r = Result.success(data='')
                return JsonResponse(r)
            with open('./debug_folder/items/{}.json'.format(spider_name)) as input_file:
                all_text = input_file.read()
            r = Result.success({'content': all_text})
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)

def find_debug_log(request):
    """
    Fetch the log of a debug (test) run.
    :param request:
    :return:
    """
    work_path = os.getcwd()
    try:
        if request.method == 'GET':
            project_name = request.GET.get('project_name')
            spider_name = request.GET.get('spider_name')
            current_line = int(request.GET.get('current_line'))
            project_path = join(PROJECTS_FOLDER, project_name)
            os.chdir(project_path)
            if not os.path.exists("debug_folder"):
                r = Result.success(data='')
                return JsonResponse(r)
            with open('./debug_folder/logs/{}.log'.format(spider_name), 'r', encoding='utf-8') as input_file:
                lines = input_file.readlines()
            # return the log from current_line onwards, numbering each line
            response = []
            for line in lines[(current_line - 1):]:
                data = {'current_line': current_line, 'data': line}
                response.append(data)
                current_line = current_line + 1
            r = Result.success(response)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)

def project_build(request, project_name):
    """
    Build and package a crawler project.
    :param request: request object
    :param project_name: project name
    :return: json
    """
    path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
    project_path = join(path, project_name)
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        description = data['description']
        build_project(project_name,
                      include_data=(project_name == 'auto_login'))
        egg = find_egg(project_path)
        if not egg:
            return JsonResponse(Result.fail("Build and packaging failed"))
        built_at = timezone.now()
        if not CrawlProject.objects.filter(name=project_name):
            CrawlProject(name=project_name, description=description,
                         built_at=built_at, egg=egg).save()
            model = CrawlProject.objects.get(name=project_name)
        else:
            model = CrawlProject.objects.get(name=project_name, is_deleted=0)
            model.built_at = built_at
            model.egg = egg
            model.description = description
            model.save()
        data = model_to_dict(model)
        r = Result.success(data)
        return JsonResponse(r)

def reset_profile_pwd(request, user_id):
    """
    Reset the current user's password.
    :param user_id:
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            old_pwd = data.get('old_pwd')
            new_pwd = data.get('new_pwd')
            confirm_pwd = data.get('confirm_pwd')
            user = CrawlUser.objects.get(id=user_id)
            if confirm_pwd != new_pwd:
                raise Exception('The two passwords do not match')
            db_pwd = user.password
            if db_pwd != password2md5(old_pwd):
                raise Exception('Incorrect password')
            user.password = password2md5(new_pwd)
            user.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

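# A minimal sketch of the password2md5 helper referenced by the password and
# login views, assuming it simply returns the hex MD5 digest of the plain-text
# password; the real helper may add a salt or use a different encoding. The
# _sketch suffix marks this as a hypothetical stand-in.
import hashlib


def password2md5_sketch(password):
    """Return the MD5 hex digest of the given password string."""
    return hashlib.md5(password.encode('utf-8')).hexdigest()
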
def login(request):
    """
    Log in (TODO: switch to JWT-only auth).
    :param request:
    :return:
    """
    try:
        domain = settings.SESSION_COOKIE_DOMAIN
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            username = data.get('username').strip()
            password = data.get('password').strip()
            user = CrawlUser.objects.get(username=username)
            if not user:
                raise Exception('Incorrect username or password')
            if password2md5(password) == user.password:
                token = jwt_tools.encode_token(user.id, user.username)
                r = Result.success(None)
                response = JsonResponse(r)
                max_age = 60 * 60 * 24 * 30  # 30 days
                response.set_cookie('dt_token', bytes.decode(token), domain=domain, max_age=max_age)
                response.set_cookie('dt_user_id', user.id, domain=domain, max_age=max_age)
                response.set_cookie('dt_username', user.username, domain=domain, max_age=max_age)
                return response
            else:
                raise Exception('Incorrect username or password')
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

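# A minimal sketch of what jwt_tools.encode_token could look like, assuming it
# is built on PyJWT with an HS256-signed payload carrying the user id, the
# username and an expiry. The secret key and the 30-day expiry window are
# placeholders, and the _sketch suffix marks this as a hypothetical stand-in.
import datetime as _datetime
import jwt  # PyJWT

JWT_SECRET_SKETCH = 'change-me'  # placeholder secret


def encode_token_sketch(user_id, username):
    """Return a signed JWT carrying the user identity, valid for 30 days."""
    payload = {
        'user_id': user_id,
        'username': username,
        'exp': _datetime.datetime.utcnow() + _datetime.timedelta(days=30),
    }
    return jwt.encode(payload, JWT_SECRET_SKETCH, algorithm='HS256')
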
def list_proxy_ip(request):
    """
    List all proxy IPs.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            keyword = data.get('keyword')
            page = data.get('page', 1)
            size = data.get('size', 15)
            ip_type = data.get('ip_type')
            status = request.GET.get('status')  # status is read from the query string
            proxy_ips = CrawlProxyIP.objects.filter(is_deleted=0)
            if keyword is not None:
                proxy_ips = proxy_ips.filter(ip__icontains=keyword)
            if ip_type is not None:
                proxy_ips = proxy_ips.filter(ip_type=ip_type)
            if status is not None:
                proxy_ips = proxy_ips.filter(status=status)
            total = proxy_ips.count()
            r = Result.success(page_helper(total, page, size, proxy_ips))
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def process_request(self, request):
    try:
        if DEBUG:
            return
        if request.path in self.white_list:
            return
        dt_token = request.COOKIES.get('dt_token')
        dt_user_id = request.COOKIES.get('dt_user_id')
        dt_username = request.COOKIES.get('dt_username')
        if not dt_token:
            return JsonResponse(Result.fail("Missing token"), status=403)
        res = jwt_tools.decode_token(dt_token)
        if not jwt_tools.verify(res):
            return JsonResponse(Result.fail("Invalid token"), status=403)
        # check permissions
        if not self.filter_auth(dt_user_id, dt_username, request.path):
            r = Result.fail("No permission to access this resource")
            return JsonResponse(r, status=403)
        request.user_id = dt_user_id
        request.user_name = dt_username
    except ExpiredSignatureError as e:
        r = Result.fail("Login expired")
        return JsonResponse(r, status=403)
    except Exception as e:
        r = Result.fail("Invalid login")
        return JsonResponse(r, status=403)

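# Usage sketch: process_request above is a Django middleware hook, so the class
# that owns it has to be registered in settings.py. The dotted path below is an
# assumption; adjust it to the module that actually defines the middleware class.
MIDDLEWARE = [
    'django.middleware.common.CommonMiddleware',
    # ... other middleware ...
    'myapp.middleware.AuthMiddleware',  # hypothetical path to the auth middleware
]
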
def edit_script_cfg(request):
    """
    Edit the configuration of crawler scripts.
    :param request: request object
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            spider_name = data['spider_name']
            script_name = data['script_name']
            apply_to_all = data['applyToAll']
            task_id = data['project_id']
            script_args = []
            for p in data.get('params'):
                if isinstance(p['args'], str):
                    p['args'] = json.loads(p['args'])
                script_args.append(p)
                if p.get('trigger'):
                    result, message = scheduler_helper.verify_cron(p.get('trigger'))
                    if not result:
                        raise Exception('Invalid parameter: {}'.format(message))
            update_kwargs = {
                "trigger": data.get('trigger'),
                "hosts": data.get('hosts'),
                "args": json.dumps(script_args)
            }
            # apply the settings to every script of the current task in one go
            if apply_to_all:
                crawl_scripts = CrawlScript.objects.filter(task_id=task_id)
                crawl_scripts.update(**update_kwargs)
            else:
                crawl_scripts = CrawlScript.objects.get(name=spider_name, task_id=task_id)
                crawl_scripts.trigger = data.get('trigger')
                crawl_scripts.hosts = data.get('hosts')
                crawl_scripts.args = json.dumps(script_args)
                crawl_scripts.save()
            if 'params' in data and data['params']:
                args = data['params']
                # store the run arguments of each script; arguments of different
                # scheduling batches are distinguished by the md5 of the argument set
                for arg in args:
                    if apply_to_all:
                        for script in crawl_scripts:
                            v_arg = encrypt_kit.md5(json.dumps(arg))
                            crawl_redis.set("args#{}#{}".format(script.name, v_arg),
                                            json.dumps(arg['args']))
                    else:
                        v_arg = encrypt_kit.md5(json.dumps(arg))
                        crawl_redis.set("args#{}#{}".format(spider_name, v_arg),
                                        json.dumps(arg['args']))
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

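# Example of the JSON body edit_script_cfg expects, reconstructed from the
# fields the view reads above; the concrete values are illustrative only and
# the cron/host strings are placeholders.
EXAMPLE_EDIT_SCRIPT_CFG_BODY = {
    "spider_name": "example_spider",      # hypothetical spider name
    "script_name": "example_script.py",   # hypothetical script file
    "applyToAll": False,                   # True applies the settings to all scripts of the task
    "project_id": 1,                       # used as the task id by the view
    "trigger": "0 0 * * *",
    "hosts": "10.0.0.1:6800",
    "params": [
        {"trigger": "0 0 * * *", "args": {"depth": 2}},
    ],
}
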
def list_task_progress(request):
    """
    List crawler task progress.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            keyword = data.get('keyword')
            script_name = data.get('script_name')
            date = data.get('date')
            status = data.get('status')
            page = data.get('page', 1)
            size = data.get('size', 15)
            task_progress = CrawlScriptProgress.objects.filter(is_deleted=0).exclude(script_name='proxy')
            # statistics are computed for the requested date, defaulting to today
            condition_date = datetime.datetime.today().strftime('%Y-%m-%d') if date == '' else date
            stat_task_progress = task_progress.filter(start_time__gte='{} 00:00:00'.format(condition_date),
                                                      start_time__lte='{} 23:59:59'.format(condition_date))
            running_cnt = stat_task_progress.filter(status=1).count()
            success_cnt = stat_task_progress.filter(status=2).count()
            fail_cnt = stat_task_progress.filter(status=-1).count()
            if keyword is not None and keyword != '':
                task_progress = task_progress.filter(task_name__icontains=keyword)
            if script_name is not None and script_name != '':
                task_progress = task_progress.filter(script_name__icontains=script_name)
            if date is not None and date != '':
                task_progress = task_progress.filter(start_time__gte='{} 00:00:00'.format(date),
                                                     start_time__lte='{} 23:59:59'.format(date))
            if status is not None:
                task_progress = task_progress.filter(status__in=status)
            task_progress = task_progress.order_by("-id")
            total = task_progress.count()
            pager = page_helper(total, page, size, task_progress,
                                {'fail_cnt': fail_cnt, 'running_cnt': running_cnt, 'success_cnt': success_cnt})
            convert_task_progress = []
            results = pager.get('results')
            for result in results:
                result['run_time'] = time_kit.convert_ms(result.get('run_time'))
                result['script_id'] = CrawlScript.objects.get(task_name=result.get('task_name'),
                                                              name=result.get('script_name')).id
                convert_task_progress.append(result)
            pager['results'] = convert_task_progress
            r = Result.success(pager)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

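# A minimal sketch of the page_helper used by the listing views, assuming it
# slices the queryset for the requested page, converts rows with model_to_dict
# and merges any extra counters into the payload; the real helper may expose a
# different payload shape. The _sketch suffix marks it as hypothetical.
def page_helper_sketch(total, page, size, queryset, extra=None):
    """Return a paging payload: total count, page info and the rows of one page."""
    start = (page - 1) * size
    rows = [model_to_dict(obj) for obj in queryset[start:start + size]]
    payload = {'total': total, 'page': page, 'size': size, 'results': rows}
    if extra:
        payload.update(extra)
    return payload
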
def fetch_user_permissions(request):
    """
    Fetch the menu permission list of the current user.
    :param request:
    :return:
    """
    user_id = request.user_id
    user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id, is_deleted=0)
    if not user_roles:
        return JsonResponse(Result.success(data={}))
    permission_tree = build_permission_tree(user_roles)
    crawl_redis.set('permission#user#{}'.format(user_id), json.dumps(permission_tree))
    r = Result.success(data=permission_tree)
    return JsonResponse(r)

def script_remove(request):
    """
    Delete a script.
    :param request:
    :return:
    """
    try:
        if request.method == 'GET':
            script_id = request.GET['id']
            CrawlScript.objects.get(id=script_id).delete()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def project_list(request, node_id):
    """
    List the crawler projects deployed on a node.
    :param request: request object
    :param node_id: node_id
    :return: json
    """
    if request.method == 'GET':
        client = CrawlNode.objects.get(id=node_id)
        engine = get_engine(client)
        try:
            projects = engine.list_projects()
            return JsonResponse(Result.success(data=projects))
        except ConnectionError:
            return JsonResponse(Result.fail())

def create_proxy_ip(request):
    """
    Create a proxy IP.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            CrawlProxyIP.objects.create(**data)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def get_proxy_ip(request, proxy_ip_id):
    """
    Fetch a single proxy IP.
    :param request:
    :param proxy_ip_id:
    :return:
    """
    try:
        if request.method == 'GET':
            proxy_ip = CrawlProxyIP.objects.get(id=proxy_ip_id)
            r = Result.success(model_to_dict(proxy_ip))
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def get_hosts(request):
    """
    Fetch the hosts of a script by script id.
    :param request:
    :return:
    """
    try:
        if request.method == 'GET':
            script_id = request.GET.get('script_id')
            hosts = get_hosts_by_script_id(script_id)
            r = Result.success(hosts)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

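# A minimal sketch of the get_hosts_by_script_id helper used above. It assumes
# the hosts configured via edit_script_cfg are stored on CrawlScript.hosts as
# either a JSON list or a comma-separated string; the real helper may resolve
# hosts differently (for example from the task or node tables). The _sketch
# suffix marks this as a hypothetical stand-in.
def get_hosts_by_script_id_sketch(script_id):
    """Return the list of host addresses configured for a script."""
    script = CrawlScript.objects.get(id=script_id)
    raw = script.hosts or ''
    try:
        hosts = json.loads(raw)
    except ValueError:
        hosts = [h for h in raw.split(',') if h]
    return hosts if isinstance(hosts, list) else [hosts]
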
def task_deploy(request, project_name):
    """Deploy the packaged project of a task to all of its nodes."""
    work_path = os.getcwd()
    try:
        log_common.info('Entering deploy handler')
        if request.method == 'GET':
            log_common.info('Starting deploy logic')
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            project_path = join(path, project_name)
            # look up the packaged egg file
            egg = find_egg(project_path)
            if not egg:
                raise Exception('No packaged egg file found')
            with open(join(project_path, egg), 'rb') as egg_file:
                egg_file_content = egg_file.read()
            project = CrawlProject.objects.get(name=project_name, is_deleted=0)
            task = CrawlTask.objects.get(id=project.task_id)
            task.is_deploy = 1
            task.save()
            for node_id in json.loads(task.node_ids):
                node = CrawlNode.objects.get(id=node_id)
                engine = get_engine(node)
                log_common.info('{}: deploying {}'.format(node.node_ip, project_name))
                engine.add_version(project_name, int(time.time()), egg_file_content)
                log_common.info('{}: deployed {}'.format(node.node_ip, project_name))
                # update deploy info (soft-delete previous records)
                deployed_at = timezone.now()
                CrawlDeploy.objects.filter(node_id=node.id,
                                           project_id=project.id).update(is_deleted=1)
                deploy, result = CrawlDeploy.objects.update_or_create(
                    node_id=node.id,
                    project_id=project.id,
                    deployed_at=deployed_at,
                    description=project.description)
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error("task_deploy => {}".format(e))
        log_common.error("task_deploy => {}".format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)

def edit_user(request, user_id):
    """
    Edit a user.
    :param user_id:
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        alert_options = data.get('alert_options')
        user = CrawlUser.objects.get(id=user_id)
        user.account = data.get('account')
        user.mobile = data.get('mobile', '')
        user.wx_account = data.get('wx_account')
        user.comment = data.get('comment', '')
        user.alert_enable = data.get('alert_enable', 0)
        user.save()
        role_ids = data.get('role_ids')
        CrawlUserRoleRel.objects.filter(user_id=user_id).update(is_deleted=1)
        for role_id in role_ids:
            CrawlUserRoleRel.objects.create(role_id=role_id, user_id=user_id)
        # write the permission tree to redis, based on the active role relations
        user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id, is_deleted=0)
        crawl_redis.set('permission#user#{}'.format(user_id),
                        json.dumps(build_permission_tree(user_roles)))
        r = Result.success(None)
        return JsonResponse(r)

def list_scripts(request):
    """
    List the crawler scripts of a task, including the hosts each one is distributed to.
    :param request:
    :return:
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        size = data.get('size', 15)
        page = data.get('page', 1)
        task_name = data.get("task_name")
        task_id = data.get("task_id")
        script_name = data.get("script_name")
        scripts = CrawlScript.objects.filter(is_deleted=0)
        if task_id:
            scripts = scripts.filter(task_id=task_id)
        if script_name:
            scripts = scripts.filter(name__contains=script_name)
        if task_name:
            scripts = scripts.filter(task_name__contains=task_name)
        scripts = scripts.order_by("-id")
        total = scripts.count()
        response = page_helper(total, page, size, scripts)
        results = response.get('results')
        for result in results:
            result['hosts'] = ','.join(get_hosts_by_script_id(result.get('id')))
        r = Result.success(response)
        return JsonResponse(r)

def index_status(request):
    """
    Aggregate node and project status for the index page.
    :param request: request object
    :return: json
    """
    if request.method == 'GET':
        work_path = os.getcwd()
        try:
            nodes = CrawlNode.objects.all()
            data = {
                'success': 0,
                'error': 0,
                'project': 0,
            }
            # count reachable and unreachable nodes
            for client in nodes:
                try:
                    requests.get(engine_url(client.ip, client.port), timeout=1)
                    data['success'] += 1
                except ConnectionError:
                    data['error'] += 1
            # count project folders on disk
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            files = os.listdir(path)
            for file in files:
                if os.path.isdir(join(path, file)) and file not in IGNORES:
                    data['project'] += 1
            return JsonResponse(data)
        except Exception as e:
            r = Result.fail(e)
            return JsonResponse(r)
        finally:
            os.chdir(work_path)

def node_create(request):
    """
    Create a crawler node.
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        cn = CrawlNode.objects.filter(node_ip=data["node_ip"]).last()
        if not cn:
            node = CrawlNode.objects.create(**data)
            r = Result.success(model_to_dict(node))
            return JsonResponse(r)
        else:
            # update the heartbeat time to indicate the node is alive
            return JsonResponse(Result.fail('Node already exists'))

def script_enable(request):
    """
    Enable scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            ids = data.get('ids')
            control_script(ids, 0)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def list_role(request):
    """
    List roles.
    :param request:
    :return:
    """
    if request.method == 'GET':
        page = request.GET.get('page', 1)
        size = request.GET.get('size', 15)
        response = []
        roles = CrawlRole.objects.filter(is_deleted=0)
        all_permissions = CrawlPermission.objects.filter(is_deleted=0)
        for role in roles:
            rels = CrawlRolePermission.objects.filter(is_deleted=0, role_id=role.id)
            role_permissions = []
            for rel in rels:
                permission = all_permissions.get(id=rel.permission_id)
                role_permissions.append(model_to_dict(permission).get('permission_name'))
            role_dict = model_to_dict(role)
            role_dict['permission'] = role_permissions
            response.append(role_dict)
        r = Result.success(response)
        return JsonResponse(r)

def remove_proxy_ip(request, proxy_ip_id):
    """
    Delete a proxy IP (soft delete).
    :param request:
    :param proxy_ip_id:
    :return:
    """
    try:
        if request.method == 'GET':
            proxy_ip = CrawlProxyIP.objects.get(id=proxy_ip_id)
            proxy_ip.is_deleted = 1
            proxy_ip.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def create_user(request):
    """
    Create a user.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            username = data.get('username')
            if CrawlUser.objects.filter(username=username, is_deleted=0):
                raise Exception('Username already exists')
            account = data.get('account')
            mobile = data.get('mobile')
            wx_account = data.get('wx_account')
            role_ids = data.get('role_ids')
            alert_options = data.get('alert_options')
            comment = data.get('comment')
            alert_enable = data.get('alert_enable', 0)
            password = random_password(6)
            user = CrawlUser.objects.create(account=account,
                                            username=username,
                                            mobile=mobile,
                                            comment=comment,
                                            wx_account=wx_account,
                                            password=password2md5(password),
                                            alert_enable=alert_enable)
            user_id = user.id
            for role_id in role_ids:
                CrawlUserRoleRel.objects.create(user_id=user_id, role_id=role_id)
            # write the permission tree to redis
            user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id)
            crawl_redis.set('permission#user#{}'.format(user_id),
                            json.dumps(build_permission_tree(user_roles)))
            response = {'username': username, 'password': password}
            r = Result.success(response)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)

def project_tree(request, project_name):
    """
    Fetch the file tree of a crawler project.
    :param request: request object
    :param project_name: project name
    :return: json of tree
    """
    work_cwd = os.getcwd()
    try:
        if request.method == 'GET':
            path = os.path.abspath(join(work_cwd, PROJECTS_FOLDER))
            tree = get_tree(join(path, project_name))
            r = Result.success(tree)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_cwd)

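# A minimal sketch of the get_tree helper used above, assuming it returns a
# nested list of {label, children} nodes describing the project directory;
# the real helper may expose different keys or filter certain files. The
# _sketch suffix marks this as a hypothetical stand-in.
def get_tree_sketch(path):
    """Recursively describe a directory as a list of tree nodes."""
    nodes = []
    for name in sorted(os.listdir(path)):
        full = join(path, name)
        node = {'label': name}
        if os.path.isdir(full):
            node['children'] = get_tree_sketch(full)
        nodes.append(node)
    return nodes
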
def node_index(request):
    """
    List all crawler nodes.
    :param request: request object
    :return: client list
    """
    if request.method == 'GET':
        data = CrawlNode.objects.filter(is_deleted=0).order_by('-id')
        r = Result.success(data)
        return JsonResponse(r)