def list_spiders(request, id, project):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        spiders = scrapyd.list_spiders(project)
        spiders = [{'name': spider, 'id': index + 1}
                   for index, spider in enumerate(spiders)]
        return HttpResponse(json.dumps(spiders))

def get_spider_version(request, project, client_id):
    client = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        spiders = scrapyd.list_spiders(project)
        spiders = [{'name': spider, 'id': index + 1}
                   for index, spider in enumerate(spiders)]
        # JsonResponse needs safe=False to serialize a list
        return JsonResponse(spiders, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)

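# The views above call a scrapyd_url helper that is not shown in this
# section. A minimal sketch, assuming it only joins an IP and a port into
# the base URL that ScrapydAPI expects:
def scrapyd_url(ip, port):
    return 'http://{ip}:{port}'.format(ip=ip, port=port)
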
import datetime

from scrapyd_api import ScrapydAPI


def task_spider_govbuy_content_spider(a=None, b=None):
    print('=' * 100)
    print('now is %s' % datetime.datetime.now())
    scrapyd = ScrapydAPI('http://localhost:6800')
    spiders = scrapyd.list_spiders('govbuyscrapy')
    print('spider has:', spiders)
    # detail-page spider
    run_spider_id = scrapyd.schedule('govbuyscrapy', 'govbuy_wan_timing_detail')
    print('spider runner id is:', run_spider_id)
    print(scrapyd.job_status('govbuyscrapy', run_spider_id))


def task_spider_govbuy_list_by_spider_one_page(a=None, b=None):
    print('-' * 100)
    print('now is %s' % datetime.datetime.now())
    scrapyd = ScrapydAPI('http://localhost:6800')
    spiders = scrapyd.list_spiders('govbuyscrapy')
    print('spider has:', spiders)
    # list-page spider
    run_spider_id = scrapyd.schedule('govbuyscrapy', 'govbuy_wan_timing_list')
    print('spider runner id is:', run_spider_id)
    print(scrapyd.job_status('govbuyscrapy', run_spider_id))

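# Both tasks above query job_status once, right after scheduling. A minimal
# polling sketch (an assumed workflow, not part of the original tasks):
# schedule a spider, then ask Scrapyd for the job status until it reports
# 'finished'. Project and spider names are taken from the tasks above.
import time


def wait_for_spider(project='govbuyscrapy', spider='govbuy_wan_timing_detail',
                    poll_seconds=10):
    scrapyd = ScrapydAPI('http://localhost:6800')
    job_id = scrapyd.schedule(project, spider)
    while True:
        # job_status returns '', 'pending', 'running' or 'finished'
        status = scrapyd.job_status(project, job_id)
        print('job %s is %s' % (job_id, status))
        if status == 'finished':
            return job_id
        time.sleep(poll_seconds)
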
class ScrapyAgent(object):
    """Proxy class for a single Scrapyd server."""

    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)

    def __repr__(self):
        return '<ScrapyAgent %s>' % self.server_url

    @property
    def server(self):
        return self.server_url

    def list_projects(self):
        return self.scrapyd_api.list_projects()

    def del_project(self, project_name):
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def list_spiders(self, project_name):
        return self.scrapyd_api.list_spiders(project_name)

    def start_spider(self, project_name, spider_name):
        return self.scrapyd_api.schedule(project_name, spider_name)

    def cancel_spider(self, project_name, job_id):
        return self.scrapyd_api.cancel(project_name, job_id)

    def deploy(self, project_name: str, version: int, egg_byte: BinaryIO) -> "Dict or bool":
        # add_version returns the number of spiders found in the uploaded egg
        spider_num = self.scrapyd_api.add_version(project_name, version, egg_byte)
        return {
            'project': project_name,
            'version': version,
            'spiders': spider_num,
        } if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        return '{}/logs/{}/{}/{}'.format(
            self.server_url, project_name, spider_name, job_id)

    def job_status(self, project_name, job_id):
        return self.scrapyd_api.job_status(project_name, job_id)

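# Hypothetical usage of ScrapyAgent: deploy an egg and start a spider.
# 'myproject.egg', 'myproject' and 'example_spider' are placeholder names,
# not names from the original code.
import time

agent = ScrapyAgent('http://localhost:6800')
with open('myproject.egg', 'rb') as egg:
    print(agent.deploy('myproject', int(time.time()), egg))
job_id = agent.start_spider('myproject', 'example_spider')
print(agent.job_status('myproject', job_id))
print(agent.log_url('myproject', 'example_spider', job_id))
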
def spider_list(request, client_id, project_name):
    """
    get spider list from one client
    :param request: request Object
    :param client_id: client id
    :param project_name: project name
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            spiders = scrapyd.list_spiders(project_name)
            spiders = [{'name': spider, 'id': index + 1}
                       for index, spider in enumerate(spiders)]
            # JsonResponse needs safe=False to serialize a list
            return JsonResponse(spiders, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)

class Scraper:
    def __init__(self):
        self.client = ScrapydAPI("http://scrapyd:6800", timeout=10)
        self.project = 'default'

    def schedule_spider(self, spider_name: str):
        print(f"RUN SPIDER: {spider_name}")
        return self.client.schedule(self.project, spider_name)

    def cancel_job(self, job_id: str):
        return self.client.cancel(self.project, job_id)

    def get_status_of_job(self, job_id: str):
        return self.client.job_status(self.project, job_id)

    def get_all_jobs(self):
        return self.client.list_jobs(self.project)

    def get_all_spiders(self):
        return self.client.list_spiders(self.project)

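# Hypothetical usage of Scraper. It assumes a host named 'scrapyd' is
# reachable (e.g. a docker-compose service) and that the 'default' project
# contains a spider; 'quotes' is a placeholder spider name.
scraper = Scraper()
job_id = scraper.schedule_spider('quotes')
print(scraper.get_status_of_job(job_id))  # 'pending', 'running' or 'finished'
print(scraper.get_all_jobs())  # dict with 'pending', 'running' and 'finished' lists
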
def spider_list(request, node_id, project_name):
    """
    get spider list from one node
    :param request: request Object
    :param node_id: node id
    :param project_name: project name
    :return: json
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            spiders = scrapyd.list_spiders(project_name)
            spiders = [{
                'name': spider,
                'id': index + 1
            } for index, spider in enumerate(spiders)]
            return JsonResponse({"result": 1, "spiders": spiders})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)

class ScrapydProxy(SpiderServiceProxy):
    """
    Service proxy for a single Scrapyd server.
    Inherits from the spider-service base class and implements its interface.
    """

    def __init__(self, server):
        self.spider_status_name_dict = {
            SpiderStatus.PENDING: 'pending',
            SpiderStatus.RUNNING: 'running',
            SpiderStatus.FINISHED: 'finished'
        }
        super(ScrapydProxy, self).__init__(server)  # run the base-class initializer
        self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate ScrapydAPI

    def _scrapyd_url(self):
        return self.server  # base URL of the Scrapyd server

    def list_projects(self):
        """
        List all projects on this Scrapyd server as a list of name strings
        (get_project_list returns objects instead).
        """
        return self.scrapyd_api.list_projects()

    def get_project_list(self):
        """
        Get all spider projects.
        :return: list of Project objects
        """
        data = self.scrapyd_api.list_projects()
        result = []
        if data:
            for project_name in data:
                project = Project()
                project.project_name = project_name
                result.append(project)
        return result

    def delete_project(self, project_name):
        """
        Delete the given project from Scrapyd.
        :param project_name: project name
        :return: True if the project existed and was deleted
        """
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def get_slave_spider_list(self, project_name):
        try:
            # list the names of all spiders in the project
            data = self.scrapyd_api.list_spiders(project_name)
            return data if data else []
        except Exception:
            return []

    def get_spider_list(self, project_name):
        """
        Get all spiders in the given project.
        :param project_name: project name
        :return: list of SpiderInstance objects
        """
        try:
            data = self.scrapyd_api.list_spiders(project_name)
            result = []
            if data:
                for spider_name in data:
                    spider_instance = SpiderInstance()
                    spider_instance.spider_name = spider_name
                    result.append(spider_instance)
            return result
        except Exception:
            return []

    def get_daemon_status(self):
        pass

    def get_job_list(self, project_name, spider_status=None):
        """
        Fetch the status of all spider jobs in a project from Scrapyd.
        :param project_name: project name
        :param spider_status: job status; None (the default) returns all
            statuses, a status value returns only jobs in that status
        :return:
        """
        result = {
            SpiderStatus.PENDING: [],
            SpiderStatus.RUNNING: [],
            SpiderStatus.FINISHED: []
        }
        try:
            data = self.scrapyd_api.list_jobs(project_name)
            if data:
                for _status in self.spider_status_name_dict.keys():
                    for item in data[self.spider_status_name_dict[_status]]:
                        start_time, end_time = None, None
                        if item.get('start_time'):
                            start_time = datetime.datetime.strptime(
                                item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
                        if item.get('end_time'):
                            end_time = datetime.datetime.strptime(
                                item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
                        result[_status].append(
                            dict(id=item['id'],
                                 start_time=start_time,
                                 end_time=end_time))
            return result if not spider_status else result[spider_status]
        except Exception:
            return result

    def start_spider(self, project_name, spider_name):
        """
        Start the given spider in the given project.
        :param project_name: project name
        :param spider_name: spider name
        :return: the job id of the started spider, or None on failure
        """
        # extra keyword arguments are forwarded to the spider as arguments
        data = self.scrapyd_api.schedule(project_name, spider_name,
                                         pro_name=project_name)
        return data if data else None

    def cancel_spider(self, project_name, job_id):
        """
        Cancel the given job in the given project.
        :param project_name: project name str
        :param job_id: job_id str
        :return: True if the job was cancelled, otherwise False
        """
        data = self.scrapyd_api.cancel(project_name, job_id)
        return data is not None

    def deploy(self, project_name, file_path):
        """
        Deploy an uploaded egg project to Scrapyd.
        :param project_name: project name str
        :param file_path: path to the egg file str
        :return: stringified project info dict on success, otherwise False
        """
        egg = open(file_path, 'rb')
        version = int(time.time())
        spider_num = self.scrapyd_api.add_version(project_name, version, egg)
        egg.close()
        ret = {
            'version': version,
            'project': project_name,
            'spiders': spider_num,
            'node_name': socket.gethostname(),
            'status': 'ok' if spider_num else 'error'
        }
        return str(ret) if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        """
        Get the URL of a spider's log file.
        :param project_name: project name str
        :param spider_name: spider name str
        :param job_id: job_id str
        :return: URL of the log file str
        """
        return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (
            project_name, spider_name, job_id)

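# Hypothetical usage of ScrapydProxy. SpiderStatus and SpiderServiceProxy
# come from the same (unshown) module as the class above; the server URL
# is a placeholder.
proxy = ScrapydProxy('http://localhost:6800')
for name in proxy.list_projects():
    running = proxy.get_job_list(name, SpiderStatus.RUNNING)
    print(name, 'running jobs:', running)
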
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://127.0.0.1:6800')

# list the projects deployed on this Scrapyd server
list_projects = scrapyd.list_projects()
print(list_projects)

# list the spiders in the project
list_spiders = scrapyd.list_spiders('wangYiStocks')
print(list_spiders)

# list the job ids of the project
list_jobs = scrapyd.list_jobs('wangYiStocks')
print(list_jobs)

# cancel a job by its id
cancel = scrapyd.cancel('wangYiStocks', '7c8be8661d4c11ea95d06c4b903122b5')
print(cancel)

class Scrapyd_Control(object):
    def __init__(self):
        scrapyd_url = input('Enter the Scrapyd address: ')
        project = input('Enter the project name: ')
        self.project = project
        self.scrapyd = ScrapydAPI(scrapyd_url)

    # start a spider
    def schedule(self):
        spider = input('Enter the spider name: ')
        return {
            'project': self.project,
            'spider': spider,
            'jobid': self.scrapyd.schedule(self.project, spider)
        }

    start, run = schedule, schedule

    # cancel a spider
    def cancel(self):
        jobid = input('Paste the jobid of the spider to cancel: ')
        return self.scrapyd.cancel(self.project, jobid)

    # list projects
    def listprojects(self):
        return self.scrapyd.list_projects()

    # list spiders
    def listspiders(self):
        return self.scrapyd.list_spiders(self.project)

    # list all jobs
    def listjobs(self):
        return self.scrapyd.list_jobs(self.project)

    # check a job's status
    def jobstatus(self):
        jobid = input('Paste the jobid to check: ')
        return self.scrapyd.job_status(self.project, jobid)

    # list versions
    def listversions(self):
        return self.scrapyd.list_versions(self.project)

    # delete a version
    def delversion(self):
        version_name = input('Paste the version to delete: ')
        yes = input('Confirm deleting version {}? Type yes to delete, '
                    'or press Enter to skip\n'.format(version_name))
        if yes == 'yes':
            return self.scrapyd.delete_version(self.project, version_name)

    # delete the project
    def delproject(self):
        yes = input('Confirm deleting project {}? Type yes to delete, '
                    'or press Enter to skip\n'.format(self.project))
        if yes == 'yes':
            return self.scrapyd.delete_project(self.project)

    # list all commands
    def help(self):
        print("""
        start a spider      schedule|start|run
        cancel a spider     cancel
        list projects       listprojects
        list spiders        listspiders
        list all jobs       listjobs
        check job status    jobstatus
        list versions       listversions
        delete a version    delversion
        delete the project  delproject
        list all commands   help
        """)

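# A minimal interactive driver for Scrapyd_Control (an assumed entry point,
# not part of the original class): dispatch each typed command to the
# method of the same name via getattr.
if __name__ == '__main__':
    control = Scrapyd_Control()
    control.help()
    while True:
        command = input('command> ').strip()
        if command in ('exit', 'quit'):
            break
        method = getattr(control, command, None)
        if callable(method):
            result = method()
            if result is not None:
                print(result)
        else:
            print('Unknown command; type help to list commands')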