def client_status(request, id):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            scrapyd.list_projects()
            return HttpResponse('1')
        except Exception:
            return HttpResponse('0')
def ScrapydData(request):
    from scrapyd_api import ScrapydAPI
    scrapyd = ScrapydAPI('http://localhost:6800')
    settings = {'DOWNLOAD_DELAY': 2}
    # run a spider
    scrapyd.schedule('project_name', 'spider_name', settings=settings)
    print(scrapyd.list_projects())
def get_scrapyd(client):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        scrapyd.list_projects()  # probe the connection before handing back the client
        return scrapyd
    except (ConnectionError, InvalidURL):
        return False
def project_list(request, client_id):
    """
    project deployed list on one client
    :param request: request object
    :param client_id: client id
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            projects = scrapyd.list_projects()
            # list_projects returns a list, so JsonResponse needs safe=False
            return JsonResponse(projects, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
class ScrapyAgent(object):
    """Proxy class wrapping the Scrapyd API for one Scrapy project server."""

    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)

    def __repr__(self):
        return '<ScrapyAgent %s>' % self.server_url

    @property
    def server(self):
        return self.server_url

    def list_projects(self):
        return self.scrapyd_api.list_projects()

    def del_project(self, project_name):
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def list_spiders(self, project_name):
        return self.scrapyd_api.list_spiders(project_name)

    def start_spider(self, project_name, spider_name):
        return self.scrapyd_api.schedule(project_name, spider_name)

    def cancel_spider(self, project_name, job_id):
        return self.scrapyd_api.cancel(project_name, job_id)

    def deploy(self, project_name: str, version: int, egg_byte: BinaryIO) -> "Dict or bool":
        spider_num = self.scrapyd_api.add_version(project_name, version, egg_byte)
        return {
            'project': project_name,
            'version': version,
            'spiders': spider_num,
        } if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        return '{}/logs/{}/{}/{}'\
            .format(self.server_url, project_name, spider_name, job_id)

    def job_status(self, project_name, job_id):
        return self.scrapyd_api.job_status(project_name, job_id)
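# A minimal usage sketch for ScrapyAgent. The server URL, project name,
# spider name, and egg path below are assumptions, not part of the original.
import time

agent = ScrapyAgent('http://localhost:6800')  # hypothetical Scrapyd server
with open('myproject.egg', 'rb') as egg:      # hypothetical packaged egg
    info = agent.deploy('myproject', int(time.time()), egg)
if info:
    job_id = agent.start_spider('myproject', 'myspider')
    print(agent.log_url('myproject', 'myspider', job_id))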
def project_list(request, node_id):
    """
    project deployed list on one node
    :param request: request object
    :param node_id: node id
    :return: json
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            projects = scrapyd.list_projects()
            lis = [{'spider_name': project} for project in projects]
            return JsonResponse({'result': 1, 'lis': lis})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
class Spider(Resource):
    def __init__(self, url):
        self.scrapyd = ScrapydAPI(url)

    def get(self):
        data = []
        for p in self.scrapyd.list_projects():
            spiders = reduce_spiders(listjobs(self.scrapyd.list_jobs(p), p))
            data.extend(spiders)
        return data

    def post(self, name):
        p, s = name.split('.')
        jobs = self.scrapyd.list_jobs(p)
        for job in (jobs['running'] + jobs['pending']):
            if job['spider'] == s:
                return 'Already Running'
        return self.scrapyd.schedule(p, s)
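# A hedged sketch of wiring this Resource into Flask-RESTful. The routes,
# app setup, and Scrapyd URL are assumptions; `reduce_spiders` and `listjobs`
# are helpers defined elsewhere in the original project and not shown here.
from flask import Flask
from flask_restful import Api

app = Flask(__name__)
api = Api(app)
# GET /spiders lists jobs across projects; POST /spiders/<project>.<spider> schedules one
api.add_resource(Spider, '/spiders', '/spiders/<string:name>',
                 resource_class_kwargs={'url': 'http://localhost:6800'})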
def start_hot_spider(self, request):
    """
    Start the daily hot-topics spider; requires admin permission.
    :param request: user_type: the user's permission level
    :return:
    """
    data = request.data
    user_type = data.get("user_type")
    if user_type == "1":  # admin permission: start the spider
        # run_hot()
        scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed distributed crawler
        print(scrapyd.list_projects())  # list the deployed project names
        # print(scrapyd.list_spiders('default'))   # list the spiders in the project
        # print(scrapyd.list_jobs('default'))      # list info about jobs running in the project
        # print(scrapyd.list_versions('default'))  # list the project's versions
        scrapyd.schedule('default', 'hotdaily')  # start the spider
        return Response("ok")
    else:
        return Response("failed")
def schedule():
    scrapyd = ScrapydAPI("http://localhost:6800")
    projects = scrapyd.list_projects()
    print(projects)
    while True:
        print('start')
        jobids = {}
        # spawn one start greenlet per project; rebuild the list each cycle
        # so stale greenlets from earlier iterations are not re-joined
        list_start = [gevent.spawn(start, scrapyd, project) for project in projects]
        donelist = gevent.joinall(list_start)
        for greenlet in donelist:
            jobids.update(greenlet.value)
        print('wait')
        time.sleep(120)
        print('cancel')
        list_end = [gevent.spawn(end, scrapyd, project, job)
                    for project, job in jobids.items()]
        gevent.joinall(list_end)  # wait for the cancel greenlets to finish
        time.sleep(300)
        print('restart')
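# The `start` and `end` helpers are not shown in the original. A hedged
# sketch of what they might look like, given that the caller expects `start`
# to return a dict it can merge into `jobids` and later cancel one job per
# project:
def start(scrapyd, project):
    # schedule every spider in the project; the dict keeps only the last
    # spider's job id per project, matching the caller's one-job-per-project use
    jobs = {}
    for spider in scrapyd.list_spiders(project):
        jobs[project] = scrapyd.schedule(project, spider)
    return jobs

def end(scrapyd, project, job_id):
    return scrapyd.cancel(project, job_id)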
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://139.224.233.92:6800')
# egg = open('some_egg.egg', 'rb')
# scrapyd.delete_version('zhihuuser', '197ca468135511e9a5281dbc0d0a2d2c')
print(scrapyd.list_projects())
# print(scrapyd.list_spiders('zhihuuser'))
# print(scrapyd.job_status('zhihuuser', '9d4d8078135511e9a5281dbc0d0a2d2c'))
# scrapyd.delete_version('zhihuuser', '9d4d8078135511e9a5281dbc0d0a2d2c')
# scrapyd.schedule('zhihuuser', '9d4d8078135511e9a5281dbc0d0a2d2c')
class ScrapydProxy(SpiderServiceProxy):
    """
    Service proxy for a single scrapyd instance; subclasses the
    spider-service base class and implements its interface.
    """

    def __init__(self, server):
        self.spider_status_name_dict = {
            SpiderStatus.PENDING: 'pending',
            SpiderStatus.RUNNING: 'running',
            SpiderStatus.FINISHED: 'finished'
        }
        super(ScrapydProxy, self).__init__(server)  # run the parent-class initialiser
        self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate the ScrapydAPI client

    def _scrapyd_url(self):
        return self.server  # the scrapyd URL, exposed by the base class

    def list_projects(self):
        """
        List every project on this scrapyd as project-name strings;
        get_project_list returns objects instead.
        """
        return self.scrapyd_api.list_projects()

    def get_project_list(self):
        """
        List every crawler project.
        :return: a list of Project objects
        """
        data = self.scrapyd_api.list_projects()
        result = []
        if data:
            for project_name in data:
                project = Project()
                project.project_name = project_name
                result.append(project)
        return result

    def delete_project(self, project_name):
        """
        Delete the named project from scrapyd.
        :param project_name: project name
        :return: True if the project existed and was deleted, else False
        """
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def get_slave_spider_list(self, project_name):
        try:
            data = self.scrapyd_api.list_spiders(project_name)  # spider names under the project
            return data if data else []
        except Exception:
            return []

    def get_spider_list(self, project_name):
        """
        List every spider under the named project.
        :param project_name: project name
        :return: a list of SpiderInstance objects
        """
        try:
            data = self.scrapyd_api.list_spiders(project_name)
            result = []
            if data:
                for spider_name in data:
                    spider_instance = SpiderInstance()
                    spider_instance.spider_name = spider_name
                    result.append(spider_instance)
            return result
        except Exception:
            return []

    def get_daemon_status(self):
        pass

    def get_job_list(self, project_name, spider_status=None):
        """
        Fetch the status of every spider job under a project from scrapyd.
        :param project_name: project name
        :param spider_status: job status; defaults to None, which returns all
            statuses; pass a status value to return only that status
        """
        result = {
            SpiderStatus.PENDING: [],
            SpiderStatus.RUNNING: [],
            SpiderStatus.FINISHED: []
        }
        try:
            data = self.scrapyd_api.list_jobs(project_name)
            if data:
                for _status in self.spider_status_name_dict.keys():
                    for item in data[self.spider_status_name_dict[_status]]:
                        start_time, end_time = None, None
                        if item.get('start_time'):
                            start_time = datetime.datetime.strptime(
                                item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
                        if item.get('end_time'):
                            end_time = datetime.datetime.strptime(
                                item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
                        result[_status].append(
                            dict(id=item['id'],
                                 start_time=start_time,
                                 end_time=end_time))
            return result if not spider_status else result[spider_status]
        except Exception:
            return result

    def start_spider(self, project_name, spider_name):
        """
        Start the named spider under the named project.
        :param project_name: project name
        :param spider_name: spider name
        :return: the job id of the started spider, or None on failure
        """
        data = self.scrapyd_api.schedule(project_name, spider_name,
                                         pro_name=project_name)
        return data if data else None

    def cancel_spider(self, project_name, job_id):
        """
        Cancel the given job under the project.
        :param project_name: project name, str
        :param job_id: job id, str
        :return: True on successful cancellation, else False
        """
        data = self.scrapyd_api.cancel(project_name, job_id)
        return data is not None

    def deploy(self, project_name, file_path):
        """
        Deploy the uploaded egg project to scrapyd.
        :param project_name: project name, str
        :param file_path: path to the egg file, str
        :return: project info as a dict string on success, else False
        """
        version = int(time.time())
        with open(file_path, 'rb') as egg:
            # reuse `version` rather than calling time.time() twice,
            # so the reported version matches the deployed one
            spider_num = self.scrapyd_api.add_version(project_name, version, egg)
        ret = {
            'version': version,
            'project': project_name,
            'spiders': spider_num,
            'node_name': socket.gethostname(),
            'status': 'ok' if spider_num else 'error'
        }
        return str(ret) if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        """
        Build the URL of a spider's log file.
        :param project_name: project name, str
        :param spider_name: spider name, str
        :param job_id: job id, str
        :return: URL of the log file, str
        """
        return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (
            project_name, spider_name, job_id)
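# A hedged usage sketch for ScrapydProxy, assuming the surrounding project
# defines SpiderServiceProxy, SpiderStatus, Project, and SpiderInstance as
# used above; the server URL is an assumption:
proxy = ScrapydProxy('http://localhost:6800')
for project in proxy.get_project_list():
    running = proxy.get_job_list(project.project_name, SpiderStatus.RUNNING)
    print(project.project_name, len(running), 'running jobs')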
def project_list(request, id):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        projects = scrapyd.list_projects()
        return HttpResponse(json.dumps(projects))
# delversion.json: this endpoint deletes a specific version of a project
curl http://120.27.34.25:6800/delversion.json -d project=weibo -d version=v1
# Requires a project parameter (the project name) and a version parameter (the project version)
# Result: {"status": "ok"}
# status reports the outcome of the request; here the deletion succeeded

# delproject.json: deletes a project
curl http://120.27.34.25:6800/delproject.json -d project=weibo
# Requires a single project parameter, the project name
# Result: {"status": "ok"}
# status reports the outcome of the request; here the deletion succeeded

# The endpoints above are all of Scrapyd's endpoints; by calling the HTTP
# interface directly you can control project deployment, starting, running, and so on.

# 5. Using Scrapyd API: the Scrapyd API library wraps these endpoints; under
# the hood it works exactly like the HTTP requests, but the Python wrapper is
# more convenient.
# Create a Scrapyd API client:
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://120.27.34.25:6800')
# Call its methods to hit the corresponding endpoints, e.g. deployment:
egg = open('weibo.egg', 'rb')
scrapyd.add_version('weibo', 'v1', egg)
# This deploys the locally packaged egg project to the remote Scrapyd.
# Scrapyd API implements every API endpoint Scrapyd provides, with identical
# names and parameters.
# Calling list_projects lists every project deployed on Scrapyd:
scrapyd.list_projects()
# ['weibo', 'zhihu']
# See the official docs for details: http://python-scrapyd-api.readthedocs.io/
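# For completeness, a minimal sketch of the same two deletions through the
# wrapper, mirroring the curl examples above (project and version reused
# from those examples):
scrapyd.delete_version('weibo', 'v1')  # delversion.json
scrapyd.delete_project('weibo')        # delproject.json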
def scrapyd_api_test():
    scrapyd = ScrapydAPI('http://localhost:6800')
    print(scrapyd.list_projects())
class Scrapyd_Control(object):
    def __init__(self):
        scrapyd_url = input('Enter the scrapyd address: ')
        project = input('Enter the project name: ')
        self.project = project
        self.scrapyd = ScrapydAPI(scrapyd_url)

    # start a spider
    def schedule(self):
        spider = input('Enter the spider name: ')
        return {
            'project': self.project,
            'spider': spider,
            'jobid': self.scrapyd.schedule(self.project, spider)
        }

    start, run = schedule, schedule

    # cancel a spider
    def cancel(self):
        jobid = input('Paste the jobid of the spider to cancel: ')
        return self.scrapyd.cancel(self.project, jobid)

    # list projects
    def listprojects(self):
        return self.scrapyd.list_projects()

    # list spiders
    def listspiders(self):
        return self.scrapyd.list_spiders(self.project)

    # list all jobs
    def listjobs(self):
        return self.scrapyd.list_jobs(self.project)

    # check a job's status
    def jobstatus(self):
        jobid = input('Paste the jobid to inspect: ')
        return self.scrapyd.job_status(self.project, jobid)

    # list versions
    def listversions(self):
        return self.scrapyd.list_versions(self.project)

    # delete a version
    def delversion(self):
        version_name = input('Paste the version to delete: ')
        yes = input('Confirm deleting version {}? Type yes to delete, or press Enter to skip\n'.format(version_name))
        if yes == 'yes':
            return self.scrapyd.delete_version(self.project, version_name)

    # delete the project
    def delproject(self):
        yes = input('Confirm deleting project {}? Type yes to delete, or press Enter to skip\n'.format(self.project))
        if yes == 'yes':
            return self.scrapyd.delete_project(self.project)

    # list all commands
    def help(self):
        print("""
        start a spider       schedule|start|run
        cancel a spider      cancel
        list projects        listprojects
        list spiders         listspiders
        list all jobs        listjobs
        check job status     jobstatus
        list versions        listversions
        delete a version     delversion
        delete the project   delproject
        list all commands    help
        """)
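# A hedged sketch of a dispatch loop driving Scrapyd_Control; the loop itself
# and the quit/exit commands are assumptions, not part of the original class:
if __name__ == '__main__':
    control = Scrapyd_Control()
    control.help()
    while True:
        command = input('> ').strip()
        if command in ('quit', 'exit'):
            break
        method = getattr(control, command, None)
        if callable(method):
            result = method()
            if result is not None:
                print(result)
        else:
            print('Unknown command; type help to list commands')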