Example #1
def project_deploy(request, client_id, project_name):
    """
    deploy project operation
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: json of deploy result
    """
    if request.method == 'POST':
        # get project folder
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # find the packaged egg file and read it so the file handle is closed promptly
        egg = find_egg(project_path)
        with open(join(project_path, egg), 'rb') as egg_file:
            egg_data = egg_file.read()
        # get client and project model
        client = Client.objects.get(id=client_id)
        project = Project.objects.get(name=project_name)
        # execute deploy operation
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            scrapyd.add_version(project_name, int(time.time()), egg_data)
            # update deploy info
            deployed_at = timezone.now()
            Deploy.objects.filter(client=client, project=project).delete()
            deploy, result = Deploy.objects.update_or_create(client=client, project=project, deployed_at=deployed_at,
                                                             description=project.description)
            return JsonResponse(model_to_dict(deploy))
        except Exception:
            return JsonResponse({'message': get_traceback()}, status=500)
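A minimal sketch of how a view like this could be wired up, assuming Django-style URL routing; the URL pattern and module names are illustrative and not taken from the original project:
# urls.py (illustrative only)
from django.urls import path
from . import views

urlpatterns = [
    # POST /client/<client_id>/project/<project_name>/deploy/
    path('client/<int:client_id>/project/<str:project_name>/deploy/',
         views.project_deploy, name='project-deploy'),
]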
Example #2
def project_deploy(request, node_id, project_name):
    if request.method == 'POST':
        # get project folder
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # find egg file
        egg = find_egg(project_path)
        egg_file = open(join(project_path, egg), 'rb')
        # get node and project model
        node = Node.objects.get(id=node_id)
        project = Project.objects.get(spider_name=project_name)
        # execute deploy operation
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            scrapyd.add_version(project_name, int(time.time()),
                                egg_file.read())
            # update deploy info
            deployed_at = datetime.datetime.now()
            deployed_at = deployed_at.strftime("%Y-%m-%d %H:%M:%S")
            Deploy.objects.filter(node=node, project=project).delete()
            deploy, result = Deploy.objects.update_or_create(
                node=node,
                project=project,
                deployed_at=deployed_at,
                description=project.spider_desc)
            return JsonResponse({'result': 1, "deploy": model_to_dict(deploy)})
        except Exception:
            return JsonResponse({'message': get_traceback()}, status=500)
Example #3
def project_deploy(request, id, project):
    if request.method == 'GET':
        # locate the project folder and its packaged egg file
        path = os.path.abspath(merge(os.getcwd(), PROJECTS_FOLDER))
        project_path = merge(path, project)
        egg = find_egg(project_path)
        egg_file = open(merge(project_path, egg), 'rb')
        deploy_version = time.time()

        # refresh the deploy record for this client and project
        client_model = Client.objects.get(id=id)
        project_model = Project.objects.get(name=project)
        Deploy.objects.filter(client=client_model, project=project_model).delete()
        deploy = Deploy.objects.update_or_create(client=client_model, project=project_model,
                                                 description=project_model.description)
        # push the egg to the Scrapyd instance and return its raw response
        scrapyd = ScrapydAPI(scrapyd_url(client_model.ip, client_model.port))
        result = scrapyd.add_version(project, int(deploy_version), egg_file.read())
        return HttpResponse(result)
Example #4
class ScrapyAgent(object):
    """ scrapy项目代理类 """
    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)

    def __repr__(self):
        return '<ScrapyAgent %s>' % self.server_url

    @property
    def server(self):
        return self.server_url

    def list_projects(self):
        return self.scrapyd_api.list_projects()

    def del_project(self, project_name):
        try:
            return self.scrapyd_api.delete_project(project_name)
        except:
            return False

    def list_spiders(self, project_name):
        return self.scrapyd_api.list_spiders(project_name)

    def start_spider(self, project_name, spider_name):
        return self.scrapyd_api.schedule(project_name, spider_name)

    def cancel_spider(self, project_name, job_id):
        return self.scrapyd_api.cancel(project_name, job_id)

    def deploy(self, project_name: str, version: int,
               egg_byte: BinaryIO) -> "Dict or bool":
        spider_num = self.scrapyd_api.add_version(project_name, version,
                                                  egg_byte)
        return {
            'project': project_name,
            'version': version,
            'spiders': spider_num,
        } if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        return '{}/logs/{}/{}/{}'\
            .format(self.server_url, project_name, spider_name, job_id)

    def job_status(self, project_name, job_id):
        return self.scrapyd_api.job_status(project_name, job_id)
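A brief usage sketch for the ScrapyAgent class above; the server URL, egg filename, project name, and spider name are assumptions made for illustration:
import time

agent = ScrapyAgent('http://127.0.0.1:6800')             # hypothetical Scrapyd instance
with open('myproject-1.0-py3.7.egg', 'rb') as egg:       # hypothetical packaged egg
    info = agent.deploy('myproject', int(time.time()), egg)
if info:
    job_id = agent.start_spider('myproject', 'my_spider')  # Scrapyd job id
    print(agent.log_url('myproject', 'my_spider', job_id))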
Example #5
def deploy_project(project, client):
    path = get_run_path()
    path = '{path}/storage/{project}/'.format(path=path, project=project.name)
    egg = get_egg_info(project)
    if egg:
        file_path = '{path}/{egg}'.format(path=path, egg=egg.get('name'))
        egg_file = open(file_path, 'rb')
        deploy_version = date_format(time.time(), '%Y-%m-%d_%H_%M_%S')
        url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
        try:
            scrapyd = ScrapydAPI(url)
            egg_version = egg.get('version')
            result = scrapyd.add_version(project.name, deploy_version,
                                         egg_file.read())
            return result, deploy_version, egg_version
        except (ConnectionError, InvalidURL):
            return None, None, None
    else:
        return None, None, None
Example #6
class ScrapydProxy(SpiderServiceProxy):
    '''
    Single spider-service class.
    Inherits from the single spider-service base class and implements its functionality.
    '''
    def __init__(self, server):
        self.spider_status_name_dict = {
            SpiderStatus.PENDING: 'pending',
            SpiderStatus.RUNNING: 'running',
            SpiderStatus.FINISHED: 'finished'
        }
        super(ScrapydProxy, self).__init__(server)  # run the parent-class initializer
        self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate the ScrapydAPI client

    def _scrapyd_url(self):
        return self.server  # the scrapyd URL, obtained via the getter implemented on the base class

    def list_projects(self):
        """
        List all projects on this scrapyd instance as project-name strings
        (get_project_list returns objects instead).
        :return:
        """
        # fetch the list of all projects on the scrapyd server
        return self.scrapyd_api.list_projects()

    def get_project_list(self):
        """
        Fetch the list of all spider projects.
        :return: a list of Project objects
        """
        data = self.scrapyd_api.list_projects()  # all project names on the scrapyd server
        result = []
        if data:
            for project_name in data:
                project = Project()  # instantiate a project object
                project.project_name = project_name
                result.append(project)
        return result

    def delete_project(self, project_name):
        """
        Delete the specified project on scrapyd.
        :param project_name: project name
        :return:
        """
        try:
            return self.scrapyd_api.delete_project(
                project_name)  # returns the status; True if the project existed and was deleted
        except:
            return False

    def get_slave_spider_list(self, project_name):
        try:
            data = self.scrapyd_api.list_spiders(
                project_name)  # list the names of all spiders under the project
            return data if data else []
        except:
            return []

    def get_spider_list(self, project_name):
        """
        Fetch the list of all spider names under the specified project.
        :param project_name: project name
        :return: a list of SpiderInstance objects
        """
        try:
            data = self.scrapyd_api.list_spiders(
                project_name)  # list the names of all spiders under the project
            result = []
            if data:
                for spider_name in data:
                    spider_instance = SpiderInstance()
                    spider_instance.spider_name = spider_name
                    result.append(spider_instance)
            return result
        except:
            return []

    def get_daemon_status(self):
        pass

    def get_job_list(self, project_name, spider_status=None):
        """
        Fetch the status of every spider job under a project from scrapyd.
        :param project_name: name of the spider project
        :param spider_status: job status; defaults to None, which returns all statuses; if a status value is passed, only jobs in that status are returned
        :return:
        """
        result = {
            SpiderStatus.PENDING: [],
            SpiderStatus.RUNNING: [],
            SpiderStatus.FINISHED: []
        }
        try:
            data = self.scrapyd_api.list_jobs(project_name)
            if data:
                for _status in self.spider_status_name_dict.keys():
                    for item in data[self.spider_status_name_dict[_status]]:
                        start_time, end_time = None, None
                        if item.get('start_time'):
                            start_time = datetime.datetime.strptime(
                                item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
                        if item.get('end_time'):
                            end_time = datetime.datetime.strptime(
                                item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
                        result[_status].append(
                            dict(id=item['id'],
                                 start_time=start_time,
                                 end_time=end_time))
            return result if not spider_status else result[spider_status]
        except:
            return result

    def start_spider(self, project_name, spider_name):
        """
        Start the specified spider under the specified project.
        :param project_name: project name
        :param spider_name: spider name
        :return: the id of the started job, or None if the start failed
        """
        data = self.scrapyd_api.schedule(project_name,
                                         spider_name,
                                         pro_name=project_name)
        return data if data else None

    def cancel_spider(self, project_name, job_id):
        """
        Cancel the specified job under a project.
        :param project_name: project name, str
        :param job_id: job_id, str
        :return: True if the job was cancelled successfully, otherwise False
        """
        data = self.scrapyd_api.cancel(project_name, job_id)
        return data is not None

    def deploy(self, project_name, file_path):
        """
        Deploy the uploaded egg project to scrapyd.
        :param project_name: project name, str
        :param file_path: path of the egg file, str
        :return: stringified project info on success, otherwise False
        """
        egg = open(file_path, 'rb')
        version = int(time.time())
        spider_num = self.scrapyd_api.add_version(project_name, version, egg)
        egg.close()
        ret = {
            'version': version,
            'project': project_name,
            'spiders': spider_num,
            'node_name': socket.gethostname(),
            'status': 'ok' if spider_num else 'error'
        }
        return str(ret) if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        """
        Get the log of a spider run.
        :param project_name: project name, str
        :param spider_name: spider name, str
        :param job_id: job_id, str
        :return: url of the log file, str
        """
        return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (
            project_name, spider_name, job_id)
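A short usage sketch for the ScrapydProxy class above, assuming the base class stores the server URL passed to the constructor; the server URL and project name are illustrative assumptions, and SpiderStatus comes from the surrounding project:
proxy = ScrapydProxy('http://127.0.0.1:6800')   # hypothetical Scrapyd instance
for job in proxy.get_job_list('myproject', spider_status=SpiderStatus.RUNNING):
    print(job['id'], job['start_time'])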
Example #7
# delversion.json: this endpoint deletes a specific version of a project
curl http://120.27.34.25:6800/delversion.json -d  project=weibo -d version=v1
# it takes a project parameter (the project name) and a version parameter (the project version)
# response:
{"status": "ok"}
# status reports how the request went; here the deletion succeeded

# delproject.json: deletes a whole project
curl http://120.27.34.25:6800/delproject.json -d project=weibo

# it takes a single project parameter, the project name
# response:
{"status": "ok"}
# status reports how the request went; here the deletion succeeded
# the endpoints above make up Scrapyd's complete HTTP API; calling them directly is enough to control project deployment, start-up, runs, and so on

# 5. Using the Scrapyd API: the Scrapyd API library wraps these endpoints; the underlying mechanism is exactly the same HTTP requests, but the Python wrapper is more convenient
# create a Scrapyd API client:
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://120.27.34.25:6800')
# call its methods to perform the corresponding operations, for example a deployment:
egg = open('weibo.egg', 'rb')
scrapyd.add_version('weibo', 'v1', egg)
# this is how you package a project as an egg file and deploy the locally built egg to the remote Scrapyd
# Scrapyd API also implements every API endpoint Scrapyd provides, with the same names and the same parameters
# calling the list_projects method lists every project deployed on Scrapyd:
scrapyd.list_projects()
['weibo', 'zhihu']
# see the official documentation for details: http://python-scrapyd-api.readthedocs.io/
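Following the same pattern, the delversion.json and delproject.json endpoints shown earlier map onto wrapper methods of the same client object; a short sketch reusing the 'weibo' project and 'v1' version from the example above:
# delete one version of a project, then the whole project
scrapyd.delete_version('weibo', 'v1')
scrapyd.delete_project('weibo')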
Example #8
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI("http://127.0.0.1:6800")
egg = open("book.egg", "rb")
scrapyd.add_version("book", "V1", egg)
Example #9
class Schedular:
    def __init__(self):
        self._scrapyd = None
        try:
            self._scrapyd = ScrapydAPI('http://{}:{}'.format(
                config['Scrapyd']['host'], config['Scrapyd']['port']))
        except KeyError as e:
            logger.error("{}: No such key exists - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to create a scrapyd object - {}".format(
                class_fullname(e), str(e)))

    def addversion(self,
                   project,
                   version,
                   egg_filename='pricewatch_bot-0.0.1-py3.7.egg'):
        """ Scrapyd API: addversion - https://scrapyd.readthedocs.io/en/stable/api.html#addversion-json
        """
        if not self._scrapyd:
            logger.error(
                "No scrapyd object find. Unable to add a new version.")
            return None
        num_of_spiders = None
        try:
            with open(os.path.join(settings.APP_DIST_DIRPATH, egg_filename),
                      'rb') as egg:
                num_of_spiders = self._scrapyd.add_version(
                    project, version, egg)
        except FileNotFoundError as e:
            logger.error("{}: {}".format(class_fullname(e), str(e)))
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to add a version - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info(
                "version '{}' for project '{}' added/updated - {} spider(s)".
                format(version, project, num_of_spiders))
            # call API to create a version
            response = requests.post(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'version': version,
                    'status': settings.SCHEDULES_VERSION_STATUS_ADDED,
                    'added_at': str(datetime.now()),
                    'deleted_at': None,
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to add a version - {} - {}".format(
                        response.status_code, response.reason, response.text))
        finally:
            return num_of_spiders

    def schedule(self, project, spider, **kwargs):
        if not self._scrapyd:
            logger.error("No scrapyd object find. Unable to schedule a job.")
            return None
        _jobid = str(uuid.uuid4())
        kwargs['jobid'] = _jobid  # a scrapyd parameter
        kwargs['job_id'] = _jobid  # passed on to the spider
        jobid = None  # keep defined even if scheduling raises before assignment
        try:
            _s = None  # scrapy settings as a dict, e.g. {'DOWNLOAD_DELAY': 2}
            jobid = self._scrapyd.schedule(project,
                                           spider,
                                           settings=_s,
                                           **kwargs)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to schedule a job - {}".format(
                class_fullname(e), str(e)))
        else:
            if jobid != _jobid:
                # no exception object exists in this branch, so only report the mismatched ids
                logger.error(
                    "Invalid jobid [entered vs returned] [{} vs {}]".format(
                        _jobid, jobid))
            else:
                logger.info(
                    "new scheduled job '{}' for project '{}', spider '{}' has been set"
                    .format(jobid, project, spider))
                # call API to create a job
                response = requests.post(
                    'http://{}:{}/api/schedule/job/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port']),
                    json={
                        'job_id': jobid,
                        'project': project,
                        'spider': spider,
                        'version': kwargs.pop('_version', None),
                        'settings': _s,
                        'other_params': kwargs,
                        'status': settings.SCHEDULES_JOB_STATUS_PENDING,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to add a new job - {} - {}".
                        format(response.status_code, response.reason,
                               response.text))
        finally:
            return jobid

    def listjobs(self, project):
        if not self._scrapyd:
            logger.error("No scrapyd object find. Unable to list jobs.")
            return None
        jobs = None
        try:
            jobs = self._scrapyd.list_jobs(project)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to list jobs - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("list of jobs for project '{}' - {}".format(
                project, str(jobs)))
            self._store_jobs(project, jobs)
        finally:
            return jobs

    def _store_jobs(self, project, jobs):
        """ parse jobs and store information into db
        """
        if all(_j in jobs for _j in ['running', 'finished']):
            for x in jobs['running']:
                # call API to update a running job
                response = requests.put(
                    'http://{}:{}/api/schedule/job/{}/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port'], x['id']),
                    json={
                        'job_id': x['id'],
                        'project': project,
                        'spider': x['spider'],
                        'start_time': x['start_time'],
                        'status': settings.SCHEDULES_JOB_STATUS_RUNNING,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to update a running job - {} - {}"
                        .format(response.status_code, response.reason,
                                response.text))
            for x in jobs['finished']:
                # call API to update a finished job
                response = requests.put(
                    'http://{}:{}/api/schedule/job/{}/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port'], x['id']),
                    json={
                        'job_id': x['id'],
                        'project': project,
                        'spider': x['spider'],
                        'start_time': x['start_time'],
                        'end_time': x['end_time'],
                        'status': settings.SCHEDULES_JOB_STATUS_FINISHED,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to update a finished job - {} - {}"
                        .format(response.status_code, response.reason,
                                response.text))

    def delversion(self, project, version):
        """ delversion
        """
        if not self._scrapyd:
            logger.error("No scrapyd object find. Unable to delete version.")
            return False
        deleted = False
        try:
            deleted = self._scrapyd.delete_version(project, version)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to delete version - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info(
                "successfully deleted project '{}' version '{}'".format(
                    project, version))
            # update deleted version
            response = requests.put(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'version': version,
                    'status': settings.SCHEDULES_VERSION_STATUS_DELETED,
                    'deleted_at': str(datetime.now()),
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to update a deleted version - {} - {}"
                    .format(response.status_code, response.reason,
                            response.text))
        finally:
            return deleted

    def delproject(self, project):
        """ delproject
        """
        if not self._scrapyd:
            logger.error("No scrapyd object find. Unable to delete version.")
            return False
        deleted = False
        try:
            deleted = self._scrapyd.delete_project(project)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to delete project - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("successfully deleted project '{}'".format(project))
            # update deleted project
            response = requests.put(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'status': settings.SCHEDULES_VERSION_STATUS_DELETED,
                    'deleted_at': str(datetime.now()),
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to update deleted project - {} - {}"
                    .format(response.status_code, response.reason,
                            response.text))
        finally:
            return deleted

    def close(self):
        # the scrapyd client may never have been created if configuration was missing
        if self._scrapyd:
            self._scrapyd.client.close()
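An illustrative driver for the Schedular class above; the project name, version, and spider name are assumptions, and config, settings, and the default egg filename come from the surrounding application:
scheduler = Schedular()
spiders = scheduler.addversion('pricewatch_bot', 'v1')  # uses the default egg filename
if spiders:
    job_id = scheduler.schedule('pricewatch_bot', 'amazon_spider')  # hypothetical spider name
    if job_id:
        scheduler.listjobs('pricewatch_bot')
scheduler.close()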