def task_deploy(request, project_name):
    work_path = os.getcwd()  # capture before try so the finally block can always restore it
    try:
        log_common.info('Entering deploy handler')
        if request.method == 'GET':
            log_common.info('Starting deploy logic')
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            project_path = join(path, project_name)
            # Locate the packaged egg file
            egg = find_egg(project_path)
            if not egg:
                raise Exception('No packaged egg file found')
            with open(join(project_path, egg), 'rb') as egg_file:
                egg_file_content = egg_file.read()
            project = CrawlProject.objects.get(name=project_name, is_deleted=0)
            task = CrawlTask.objects.get(id=project.task_id)
            task.is_deploy = 1
            task.save()
            for node_id in json.loads(task.node_ids):
                node = CrawlNode.objects.get(id=node_id)
                engine = get_engine(node)
                log_common.info('{}: deploying {}'.format(node.node_ip, project_name))
                engine.add_version(project_name, int(time.time()), egg_file_content)
                log_common.info('{}: deployed {}'.format(node.node_ip, project_name))
                # Update deploy info: soft-delete old records, then record this deploy
                deployed_at = timezone.now()
                CrawlDeploy.objects.filter(
                    node_id=node.id, project_id=project.id).update(is_deleted=1)
                deploy, result = CrawlDeploy.objects.update_or_create(
                    node_id=node.id,
                    project_id=project.id,
                    deployed_at=deployed_at,
                    description=project.description)
        r = Result.success("")
        return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error("task_deploy => {}".format(e))
        log_common.error("task_deploy => {}".format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)
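# task_deploy assumes get_engine returns a Scrapyd client for the node. A minimal
# sketch, assuming the python-scrapyd-api package; the node_username/node_password
# fields are illustrative, not confirmed by the source. Note python-scrapyd-api's
# add_version expects a file-like egg, so the project's engine may wrap the raw
# bytes passed above.
from scrapyd_api import ScrapydAPI

def get_engine(node):
    # Build the Scrapyd endpoint from the node record; auth is optional.
    target = 'http://{}:{}'.format(node.node_ip, node.node_port)
    return ScrapydAPI(target, auth=(node.node_username, node.node_password))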
def build_egg(project, include_data=False):
    """
    Build an egg package for the project.
    :param project: project name
    :param include_data: whether to bundle data files into the egg
    :return: path to the built egg file
    """
    work_path = os.getcwd()
    try:
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project)
        os.chdir(project_path)
        settings = config(project_path, 'settings', 'default')
        if include_data:
            create_data_setup_py(project_path, settings=settings, project=project)
        else:
            create_default_setup_py(project_path, settings=settings, project=project)
        d = tempfile.mkdtemp(prefix="dt-")
        o = open(os.path.join(d, "stdout"), "wb")
        e = open(os.path.join(d, "stderr"), "wb")
        # Use the current interpreter rather than whatever 'python' is on PATH
        retry_on_eintr(
            check_call,
            [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
            stdout=o, stderr=e)
        o.close()
        e.close()
        egg = glob.glob(os.path.join(d, '*.egg'))[0]
        # Remove any previously built egg before moving the new one in
        if find_egg(project_path):
            os.remove(join(project_path, find_egg(project_path)))
        shutil.move(egg, project_path)
        return join(project_path, find_egg(project_path))
    except Exception as e:
        import traceback
        log_common.error(">build_egg {}".format(e))
        log_common.error(">build_egg = {}".format(traceback.format_exc()))
    finally:
        os.chdir(work_path)
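# Both task_deploy and build_egg rely on a find_egg helper that is not shown
# here. A plausible minimal version, assuming it returns the basename of the
# first .egg file in the directory (or None when none exists):
import glob
import os

def find_egg(directory):
    """Return the basename of the first *.egg file in directory, or None."""
    eggs = glob.glob(os.path.join(directory, '*.egg'))
    return os.path.basename(eggs[0]) if eggs else None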
def run(self):
    while True:
        try:
            # Re-register all pending sync tasks on each pass
            self.scheduler.remove_all_jobs()
            sync_task_models = CrawlSyncTask.objects.filter(is_deleted=0)
            if not sync_task_models:
                log_common.warn('No sync tasks fetched')
                continue
            for sync_model in sync_task_models:
                node_ports = eval(sync_model.execute_host)
                if not sync_model.source_cfg:
                    continue
                source_cfg = eval(sync_model.source_cfg)
                target_cfg = eval(sync_model.target_cfg)
                args = {
                    "conditions": source_cfg["source_condition"],
                    "path": target_cfg["target_path"],
                }
                trigger = sync_model.schedule_date
                mix = "{}-{}-{}".format(trigger, sync_model.source_cfg, sync_model.target_cfg)
                job_id = "{}-{}".format(str(sync_model.id), mix)
                md5_job = md5(job_id)
                crawl_redis.set("sync#cfg#{}".format(md5_job), json.dumps(args))
                self.scheduler.add_job(work_func,
                                       trigger="cron",
                                       **eval(trigger),
                                       id=md5_job,
                                       args=[node_ports, "pro_sync_erp", "erp_sync",
                                             md5_job, sync_model.id])
        except Exception as ex:
            import traceback
            log_common.error("Failed to schedule data sync tasks: {}".format(ex))
            log_common.error("Failed to schedule data sync tasks = {}".format(
                traceback.format_exc()))
        finally:
            connections.close_all()
            time.sleep(3 * 60)
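# The loop above expects CrawlSyncTask.schedule_date to hold a string that
# eval()s to APScheduler cron keyword arguments. A hypothetical record such as
#
#   sync_model.schedule_date = "{'hour': '2', 'minute': '30'}"
#
# expands to scheduler.add_job(work_func, trigger="cron", hour='2', minute='30', ...),
# i.e. run daily at 02:30. The md5 helper used for job ids is presumably a thin
# wrapper along these lines:
import hashlib

def md5(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()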
def node_spider_info(request):
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        node_url = engine_url(data['node_ip'], data['node_port'])
        client_d = {}
        try:
            client_d['status'] = 'disconnect'
            client_d['projects_count'] = 0
            client_d['projects'] = []
            client_d['spiders_count'] = client_d['pending'] = client_d['running'] = client_d['finished'] = 0
            response = requests.get(node_url + '/listprojects.json', timeout=2)
            if response:
                info = json.loads(response.text)
                client_d['projects_count'] = len(info['projects'])
                client_d['projects'] = info['projects']
                client_d['status'] = info['status']
                for project in info['projects']:
                    project_info = json.loads(
                        requests.get(node_url + '/listspiders.json?project=' + project,
                                     timeout=2).text)
                    client_d['spiders_count'] = client_d['spiders_count'] + len(project_info['spiders'])
                    project_info = json.loads(
                        requests.get(node_url + '/listjobs.json?project=' + project,
                                     timeout=2).text)
                    client_d['pending'] = client_d['pending'] + len(project_info['pending'])
                    client_d['running'] = client_d['running'] + len(project_info['running'])
                    # Bug fix: accumulate into 'finished', not 'running'
                    client_d['finished'] = client_d['finished'] + len(project_info['finished'])
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout,
                requests.exceptions.HTTPError) as e:
            log_common.error(e)
        r = Result.success(client_d)
        return JsonResponse(r)
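# node_spider_info builds node_url via an engine_url helper that is not shown.
# Judging by the '/listprojects.json' and '/listjobs.json' calls it must produce
# a Scrapyd base URL; a minimal sketch:
def engine_url(ip, port):
    return 'http://{}:{}'.format(ip, port)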
def run(self):
    while True:
        try:
            while crawl_redis.llen('crawl_delay_queue') > 0:
                log_common.info('Delay queue currently holds {} pending tasks'.format(
                    str(crawl_redis.llen('crawl_delay_queue'))))
                arg = crawl_redis.blpop('crawl_delay_queue', timeout=3)
                if arg:
                    run_arg = json.loads(arg[1])
                    project = run_arg.get('project')
                    spider = run_arg.get('spider')
                    host = run_arg.get('host')
                    port = run_arg.get('port')
                    args = run_arg.get('args')
                    engine = get_general_engine(host, port)
                    engine_kit.schedule(engine, project, spider, **args)
                time.sleep(3)
        except Exception as e:
            log_common.error('>>>> [DelayTaskSchedulerWork] scheduling error: {}'.format(e))
        finally:
            time.sleep(7 * 60)
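# The worker above pops JSON payloads off the 'crawl_delay_queue' Redis list.
# A matching producer would push a payload like this (field names taken from
# the run_arg.get() calls above; the values are illustrative):
crawl_redis.rpush('crawl_delay_queue', json.dumps({
    'project': 'pro_sync_erp',
    'spider': 'erp_sync',
    'host': '10.0.0.1',      # hypothetical node address
    'port': 6800,
    'args': {'batch_id': 'abc123'},
}))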
def work_func(nodes, project, spider, md5_job, task_id):
    log_common.warn("Nodes executing current sync task: {}".format(json.dumps(nodes)))
    # apscheduler bug fix: guard against duplicate firing with a distributed lock
    try:
        lock = dlm.lock("dlm#{}".format(md5_job), 1000 * 30)
        if lock:
            for node in nodes:
                # Check health/liveness of the execution node here
                engine = get_engine_by_ip(node)
                try:
                    args = {
                        # Redis password masked in source
                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                            db_conf.redis_host, str(db_conf.redis_port)),
                        "batch_id": md5_job,
                        "task_id": task_id
                    }
                    jobs = engine.schedule(project, spider, **args)
                    task = CrawlSyncTask.objects.get(id=task_id)
                    task.job_id = jobs
                    task.save()
                    log_common.warning("{}, {}: {}; Jobs: {}".format(
                        str(task_id), project, spider, jobs))
                except Exception as err:
                    import traceback
                    log_common.error("Failed to schedule task on node: {}".format(err))
                    log_common.error("Failed to dispatch sync task: {}".format(
                        traceback.format_exc()))
        else:
            log_common.warning("batch: {} locked".format(md5_job))
    finally:
        pass
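# work_func serializes concurrent firings through dlm, a distributed lock
# manager. A minimal setup, assuming the redlock-py package (its lock() takes a
# resource name and a TTL in milliseconds, matching the call above):
from redlock import Redlock

dlm = Redlock([{"host": db_conf.redis_host,
                "port": db_conf.redis_port,
                "db": 0}])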
def collect_script_progress(request):
    """
    Receive and persist task execution progress reports.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            script_name = data["script_name"]
            batch = data["batch"]
            script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
            arg_key = data.get('arg_key')
            if arg_key:
                data['arg'] = bytes.decode(crawl_redis.get('args#{}'.format(arg_key)))
            log_common.error('script_name: {}, batch_id: {}'.format(script_name, batch))
            if script_progress:
                log_common.error('update progress script_name: {}, batch_id: {}'.format(script_name, batch))
                sp = script_progress[0]
                data["task_name"] = sp.task_name
                data["id"] = sp.id
                # Preserve an earlier failure message if the new report lacks one
                if data['status'] == -1 and not data.get('msg') and sp.msg:
                    data['msg'] = sp.msg
                result = script_progress.update(**data)
                if data['status'] == -1:
                    user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                    user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                    to_user = '******'.join(user_ids)  # delimiter masked in source
                    wx_tools.env_send_card_message(to_user, 'Spider exception',
                                                   'Spider: {} raised an exception'.format(script_name))
            else:
                try:
                    log_common.error('new progress script_name: {}, batch_id: {}'.format(script_name, batch))
                    css = CrawlScript.objects.filter(name=script_name, is_deleted=0)
                    if css:
                        cs = css[0]
                        data["task_name"] = cs.task_name
                        result = CrawlScriptProgress.objects.create(**data)
                    else:
                        log_common.warn("no task found for {}!".format(script_name))
                except IntegrityError as e:
                    log_common.error('>>>>>>>>>>>>>>>>>>> catch IntegrityError >>>>>>>>>>>>>>>>>>>>>')
                    # Handle the race where a script reports twice concurrently
                    script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
                    sp = script_progress[0]
                    data["task_name"] = sp.task_name
                    data["id"] = sp.id
                    result = script_progress.update(**data)
                    if data['status'] == -1:
                        user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                        # str() needed here (as in the branch above) so join() works
                        user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                        to_user = '******'.join(user_ids)  # delimiter masked in source
                        wx_tools.env_send_card_message(to_user, 'Spider exception',
                                                       'Spider: {} raised an exception'.format(script_name))
        r = Result.success({})
        return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error('v3v3: error while reporting progress data = {}'.format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
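# A script would report progress to this endpoint roughly as follows. The URL
# path is hypothetical; the required script_name, batch, and status keys come
# from the handler above (status -1 triggers the WeChat alert):
import requests

requests.post('http://crawl-admin/api/collect_script_progress',  # hypothetical URL
              json={'script_name': 'erp_sync',
                    'batch': 'abc123',
                    'status': -1,
                    'msg': 'login failed'})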