Code Example #1
def client_status(request, id):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            scrapyd.list_projects()
            return HttpResponse('1')
        except Exception:
            return HttpResponse('0')
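The scrapyd_url helper used above is not included in this excerpt. Judging from Code Example #3 below, it presumably just assembles the base URL from the client's host and port; a minimal sketch of such a helper (the body is an assumption) could be:

def scrapyd_url(ip, port):
    # Hypothetical helper: build the Scrapyd base URL from host and port,
    # mirroring the 'http://{ip}:{port}' pattern seen in Code Example #3.
    return 'http://{ip}:{port}'.format(ip=ip, port=port)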
Code Example #2
File: views.py  Project: theo1201/djangoscrapy
def ScrapydData(request):
    from scrapyd_api import ScrapydAPI
    scrapyd = ScrapydAPI('http://localhost:6800')
    settings = {'DOWNLOAD_DELAY': 2}
    # Run a spider
    scrapyd.schedule('project_name', 'spider_name', settings=settings)
    print(scrapyd.list_projects())
Code Example #3
File: get_scrapyd.py  Project: leeyis/Gerapy
def get_scrapyd(client):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        result = scrapyd.list_projects()
        return scrapyd
    except (ConnectionError, InvalidURL):
        return False
Code Example #4
File: views.py  Project: xingxing17/Gerapy
def project_list(request, client_id):
    """
    project deployed list on one client
    :param request: request object
    :param client_id: client id
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            projects = scrapyd.list_projects()
            return JsonResponse(projects)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Code Example #5
File: ScrapyAgent.py  Project: dayiwan/ScrapyKeeper
class ScrapyAgent(object):
    """ scrapy项目代理类 """
    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)

    def __repr__(self):
        return '<ScrapyAgent %s>' % self.server_url

    @property
    def server(self):
        return self.server_url

    def list_projects(self):
        return self.scrapyd_api.list_projects()

    def del_project(self, project_name):
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def list_spiders(self, project_name):
        return self.scrapyd_api.list_spiders(project_name)

    def start_spider(self, project_name, spider_name):
        return self.scrapyd_api.schedule(project_name, spider_name)

    def cancel_spider(self, project_name, job_id):
        return self.scrapyd_api.cancel(project_name, job_id)

    def deploy(self, project_name: str, version: int,
               egg_byte: BinaryIO) -> "Dict or bool":
        spider_num = self.scrapyd_api.add_version(project_name, version,
                                                  egg_byte)
        return {
            'project': project_name,
            'version': version,
            'spiders': spider_num,
        } if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        return '{}/logs/{}/{}/{}'\
            .format(self.server_url, project_name, spider_name, job_id)

    def job_status(self, project_name, job_id):
        return self.scrapyd_api.job_status(project_name, job_id)
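A brief usage sketch of the agent above; the Scrapyd URL, project name, spider name, and egg path are placeholders, and deploy() expects an open binary file object as in the class definition:

import time

# Hypothetical usage of ScrapyAgent; all names below are placeholders.
agent = ScrapyAgent('http://localhost:6800')

with open('myproject.egg', 'rb') as egg:  # egg built elsewhere, e.g. by scrapyd-deploy
    info = agent.deploy('myproject', int(time.time()), egg)
print(info)  # {'project': ..., 'version': ..., 'spiders': ...} on success, False otherwise

job_id = agent.start_spider('myproject', 'myspider')
print(agent.job_status('myproject', job_id))  # e.g. 'pending', 'running' or 'finished'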
Code Example #6
File: views.py  Project: jinuoA/spider
def project_list(request, node_id):
    """
    project deployed list on one node
    :param request: request object
    :param node_id: node id
    :return: json
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            projects = scrapyd.list_projects()
            lis = []
            for project in projects:
                lis.append({'spider_name': project})
            return JsonResponse({'result': 1, 'lis': lis})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Code Example #7
File: spider.py  Project: popstk/olddriver
class Spider(Resource):
    def __init__(self, url):
        self.scrapyd = ScrapydAPI(url)

    def get(self):
        data = []
        for p in self.scrapyd.list_projects():
            spiders = reduce_spiders(listjobs(self.scrapyd.list_jobs(p), p))
            data.extend(spiders)
        return data

    def post(self, name):
        p, s = name.split('.')
        jobs = self.scrapyd.list_jobs(p)
        for job in (jobs['running'] + jobs['pending']):
            if job['spider'] == s:
                return 'Already Running'
        return self.scrapyd.schedule(p, s)
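The listjobs and reduce_spiders helpers used in get() are not part of this excerpt. A hypothetical minimal version, assuming listjobs flattens ScrapydAPI.list_jobs() output into records tagged with the project and status, and reduce_spiders keeps one record per spider:

# Hypothetical helpers; the real olddriver implementations may differ.
def listjobs(jobs, project):
    # Flatten {'pending': [...], 'running': [...], 'finished': [...]} into one
    # list of records tagged with the project and the job status.
    records = []
    for status in ('pending', 'running', 'finished'):
        for job in jobs.get(status, []):
            records.append({'project': project,
                            'spider': job.get('spider'),
                            'job': job.get('id'),
                            'status': status})
    return records


def reduce_spiders(records):
    # Keep a single record per spider name.
    by_spider = {}
    for record in records:
        by_spider.setdefault(record['spider'], record)
    return list(by_spider.values())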
Code Example #8
    def start_hot_spider(self, request):
        """
        Start the daily hot-topics spider; requires admin permission.
        :param request: request carrying user_type, the user's permission level
        :return:
        """
        data = request.data
        user_type = data.get("user_type")
        if user_type == "1":  # admin permission: start the spider
            # run_hot()
            scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed Scrapyd service
            print(scrapyd.list_projects())  # list the deployed project names
            # print(scrapyd.list_spiders('default'))  # list the spiders in the project
            # print(scrapyd.list_jobs('default'))  # list the jobs running in the project
            # print(scrapyd.list_versions('default'))  # list the project's versions
            scrapyd.schedule('default', 'hotdaily')  # start the spider
            return Response("ok")
        else:
            return Response("failed")
Code Example #9
import time

import gevent
from scrapyd_api import ScrapydAPI


# `start` and `end` are helpers defined elsewhere in the project
# (a hypothetical sketch follows this example).
def schedule():
    scrapyd = ScrapydAPI("http://localhost:6800")
    projects = scrapyd.list_projects()
    print(projects)

    while True:
        print('start')
        jobids = {}
        list_start = []
        list_end = []

        # start every project's spiders concurrently and collect their job ids
        for project in projects:
            list_start.append(gevent.spawn(start, scrapyd, project))
        donelist = gevent.joinall(list_start)
        for greenlet in donelist:
            jobids.update(greenlet.value)

        print('wait')
        time.sleep(120)

        print('cancel')
        for project, job in jobids.items():
            list_end.append(gevent.spawn(end, scrapyd, project, job))
        time.sleep(300)
        print('restart')
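The start and end functions spawned above are not part of this excerpt. Given how their return values are used (jobids.update(greenlet.value) expects a mapping of project to job id, and end receives a project and a job id), a plausible hypothetical sketch is:

# Hypothetical helpers; the original definitions are not shown in this example.
def start(scrapyd, project):
    # Schedule the project's first spider and map project -> job id.
    spider = scrapyd.list_spiders(project)[0]
    return {project: scrapyd.schedule(project, spider)}


def end(scrapyd, project, job):
    # Cancel the job that was started for this project.
    return scrapyd.cancel(project, job)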
Code Example #10
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://139.224.233.92:6800')

#egg = open('some_egg.egg', 'rb')

#scrapyd.delete_version('zhihuuser','197ca468135511e9a5281dbc0d0a2d2c')

print(scrapyd.list_projects())

#print(scrapyd.list_spiders('zhihuuser'))

#print(scrapyd.job_status('zhihuuser','9d4d8078135511e9a5281dbc0d0a2d2c'))

#scrapyd.delete_version('zhihuuser','9d4d8078135511e9a5281dbc0d0a2d2c')

#scrapyd.schedule('zhihuuser', '9d4d8078135511e9a5281dbc0d0a2d2c')
Code Example #11
class ScrapydProxy(SpiderServiceProxy):
    '''
    Proxy class for a single spider (Scrapyd) service.
    Inherits from the single-spider-service base class and implements its interface.
    '''
    def __init__(self, server):
        self.spider_status_name_dict = {
            SpiderStatus.PENDING: 'pending',
            SpiderStatus.RUNNING: 'running',
            SpiderStatus.FINISHED: 'finished'
        }
        super(ScrapydProxy, self).__init__(server)  # run the parent class initializer
        self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate ScrapydAPI

    def _scrapyd_url(self):
        return self.server  # the scrapyd URL provided by the base class

    def list_projects(self):
        """
        获取指定scrapyd上的所有工程列表,返回工程名字符串列表,而get_project_list返回的是对象
        :return:
        """
        # 获取scrapyd上的所有工程列表
        return self.scrapyd_api.list_projects()

    def get_project_list(self):
        """
        功能: 获取所有的爬虫工程列表
        :return: 返回工程对象列表
        """
        data = self.scrapyd_api.list_projects()  # 获取scrapyd上的所有工程列表
        result = []
        if data:
            for project_name in data:
                project = Project()  # instantiate a Project object
                project.project_name = project_name
                result.append(project)
        return result

    def delete_project(self, project_name):
        """
        功能: scrapyd上删除指定工程
        :param project_name: 工程名称
        :return:
        """
        try:
            return self.scrapyd_api.delete_project(
                project_name)  # 返回状态, 工程存在, 删除后返回True
        except:
            return False

    def get_slave_spider_list(self, project_name):
        try:
            data = self.scrapyd_api.list_spiders(
                project_name)  # list all spider names in the given project
            return data if data else []
        except Exception:
            return []

    def get_spider_list(self, project_name):
        """
        功能: 获取指定工程下的所有爬虫名称列表
        :param project_name: 工程名称
        :return: 返回爬虫实例对象列表
        """
        try:
            data = self.scrapyd_api.list_spiders(
                project_name)  # 列出指定工程下所有的爬虫名称
            result = []
            if data:
                for spider_name in data:
                    spider_instance = SpiderInstance()
                    spider_instance.spider_name = spider_name
                    result.append(spider_instance)
            return result
        except Exception:
            return []

    def get_daemon_status(self):
        pass

    def get_job_list(self, project_name, spider_status=None):
        """
        从scrapyd中获取一个爬虫项目下面的所有蜘蛛任务状态
        :param project_name: 爬虫项目名称
        :param spider_status:  蜘蛛状态, 默认为None, 返回所有状态, 若传入状态值, 则返回某个状态
        :return:
        """
        result = {
            SpiderStatus.PENDING: [],
            SpiderStatus.RUNNING: [],
            SpiderStatus.FINISHED: []
        }
        try:
            data = self.scrapyd_api.list_jobs(project_name)
            if data:
                for _status in self.spider_status_name_dict.keys():
                    for item in data[self.spider_status_name_dict[_status]]:
                        start_time, end_time = None, None
                        if item.get('start_time'):
                            start_time = datetime.datetime.strptime(
                                item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
                        if item.get('end_time'):
                            end_time = datetime.datetime.strptime(
                                item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
                        result[_status].append(
                            dict(id=item['id'],
                                 start_time=start_time,
                                 end_time=end_time))
            return result if not spider_status else result[spider_status]
        except Exception:
            return result

    def start_spider(self, project_name, spider_name):
        """
        功能:启动指定工程下的指定爬虫
        :param project_name: 工程名称
        :param spider_name: 爬虫名称
        :return: 返回启动的爬虫的id, 启动不成功, 返回None
        """
        data = self.scrapyd_api.schedule(project_name,
                                         spider_name,
                                         pro_name=project_name)
        return data if data else None

    def cancel_spider(self, project_name, job_id):
        """
        功能: 取消工程下的指定job
        :param project_name: 工程名称 str
        :param job_id: job_id str
        :return: 成功取消, 返回True, 否则返回False
        """
        data = self.scrapyd_api.cancel(project_name, job_id)
        return data != None

    def deploy(self, project_name, file_path):
        """
        功能: 将上传的egg项目部署到scrapyd上
        :param project_name: 工程名称 str
        :param file_path: egg文件路径 str
        :return: 成功返回字典型工程信息, 否则返回None
        """
        egg = open(file_path, 'rb')
        version = int(time.time())
        spider_num = self.scrapyd_api.add_version(project_name,
                                                  int(time.time()), egg)
        egg.close()
        ret = {
            'version': version,
            'project': project_name,
            'spiders': spider_num,
            'node_name': socket.gethostname(),
            'status': 'ok' if spider_num else 'error'
        }
        return str(ret) if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        """
        功能: 获取爬虫的日志
        :param project_name: 工程名称 str
        :param spider_name: 爬虫名称 str
        :param job_id: job_id str
        :return: 返回log日志文件的url str
        """
        return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (
            project_name, spider_name, job_id)
Code Example #12
File: views.py  Project: zacharysells/Gerapy
def project_list(request, id):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        projects = scrapyd.list_projects()
        return HttpResponse(json.dumps(projects))
Code Example #13
# delversion.json: deletes a specific version of a project
curl http://120.27.34.25:6800/delversion.json -d project=weibo -d version=v1
# Requires a project parameter (the project name) and a version parameter (the project version)
# Result:
{"status": "ok"}
# status reports how the request went; here the deletion succeeded

# delproject.json: deletes a project
curl http://120.27.34.25:6800/delproject.json -d project=weibo

# Requires a project parameter, the project name
# Result:
{"status": "ok"}
# status reports how the request went; here the deletion succeeded
# These are all of Scrapyd's interfaces; by calling the HTTP API directly you can control project deployment, startup, running, and so on

# 5. Using the Scrapyd API library: it wraps these interfaces, and the core mechanism is exactly the same as the HTTP requests, but the Python wrapper is more convenient
# Create a ScrapydAPI instance:
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://120.27.34.25:6800')
# Call its methods to perform the corresponding operations, e.g. deployment:
egg = open('weibo.egg', 'rb')
scrapyd.add_version('weibo', 'v1', egg)
# In this way the project packaged locally as an egg file is deployed to the remote Scrapyd
# Scrapyd API implements every interface Scrapyd provides, with the same names and the same parameters
# Calling list_projects lists all projects deployed on Scrapyd:
scrapyd.list_projects()
# => ['weibo', 'zhihu']
# For details, see the official docs: http://python-scrapyd-api.readthedocs.io/
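Because the wrapper only issues the same HTTP requests under the hood, the two curl calls above can also be made directly from Python. A minimal sketch using the requests library (the host is the example server used above):

import requests

SCRAPYD = 'http://120.27.34.25:6800'

# equivalent of: curl .../delversion.json -d project=weibo -d version=v1
resp = requests.post(SCRAPYD + '/delversion.json',
                     data={'project': 'weibo', 'version': 'v1'})
print(resp.json())  # {"status": "ok"} when the version exists

# equivalent of: curl .../delproject.json -d project=weibo
resp = requests.post(SCRAPYD + '/delproject.json', data={'project': 'weibo'})
print(resp.json())  # {"status": "ok"} when the project exists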
Code Example #14
def scrapyd_api_test():
    scrapyd = ScrapydAPI('http://localhost:6800')
    print(scrapyd.list_projects())
Code Example #15
class Scrapyd_Control(object):
    def __init__(self):
        scrapyd_url = input('Enter the scrapyd address: ')
        project = input('Enter the project name: ')
        self.project = project
        self.scrapyd = ScrapydAPI(scrapyd_url)

    # Start a spider
    def schedule(self):
        spider = input('Enter the spider name: ')
        return {
            'project': self.project,
            'spider': spider,
            'jobid': self.scrapyd.schedule(self.project, spider)
        }
    
    start, run = schedule, schedule

    # Cancel a spider
    def cancel(self):
        jobid = input('Paste the jobid of the spider to cancel: ')
        return self.scrapyd.cancel(self.project, jobid)

    # List projects
    def listprojects(self):
        return self.scrapyd.list_projects()

    # List spiders
    def listspiders(self):
        return self.scrapyd.list_spiders(self.project)

    # List all jobs
    def listjobs(self):
        return self.scrapyd.list_jobs(self.project)

    # Check a job's status
    def jobstatus(self):
        jobid = input('Paste the jobid to inspect: ')
        return self.scrapyd.job_status(self.project, jobid)

    # List versions
    def listversions(self):
        return self.scrapyd.list_versions(self.project)

    # Delete a version
    def delversion(self):
        version_name = input('Paste the version to delete: ')
        yes = input('Confirm deleting version {}? Type yes, or press Enter to skip\n'.format(version_name))
        if yes == 'yes':
            return self.scrapyd.delete_version(self.project, version_name)
        else:
            pass

    # Delete the project
    def delproject(self):
        yes = input('Confirm deleting project {}? Type yes, or press Enter to skip\n'.format(self.project))
        if yes == 'yes':
            return self.scrapyd.delete_project(self.project)
        else:
            pass
        
    # List all commands
    def help(self):
        print("""
        启动爬虫 schedule|start|run
        取消爬虫 cancel
        查看项目 listprojects
        查看爬虫 listspiders
        列出所有jobs listjobs 
        查看job状态 jobstatus
        查看版本 listversions
        删除版本 delversion
        删除项目 deleproject
        列出所有命令 help
        """)