Example #1
class ScrapydLoginFinderJob(object):

    def __init__(self, seed_url, username, password, db_name, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="login_finder"):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.seed_url = seed_url
        self.username = username
        self.password = password
        self.db_name = db_name

    def schedule(self):

        self.job_id = self.scrapi.schedule(self.project, self.spider, seed_url=self.seed_url, username=self.username, password=self.password, db_name=self.db_name)

        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self):

        if not hasattr(self, "job_id"):
            raise Exception("You must schedule a job before getting the state!")

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Pending"

        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None

        return "Done"
    
    def block_until_done(self, timeout=120):

        exec_time = 0
        while True:
            exec_time += 1
            if exec_time == timeout:
                raise Exception("Timeout time reached for login_finder spider execution")

            time.sleep(1)
            state = self.get_state()
            if state == "Done":
                break
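A minimal usage sketch for this helper, assuming the class above is in scope together with its imports (time, traceback, ScrapydAPI), a Scrapyd instance on localhost:6800, and a login_finder spider deployed in the default project; every value below is an illustrative placeholder:

job = ScrapydLoginFinderJob(
    seed_url="http://example.com/login",  # placeholder seed URL
    username="demo_user",                 # placeholder credentials
    password="demo_pass",
    db_name="memex_demo",                 # placeholder database name
)
print("scheduled job:", job.schedule())
job.block_until_done(timeout=120)         # polls get_state() once per second
print("final state:", job.get_state())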
Example #2
def call_scrapyd_service():
    """Drive the spiders through the API.
    Reference docs: https://pypi.python.org/pypi/python-scrapyd-api#downloads
    """
    scrapyd = ScrapydAPI('http://localhost:6800')
    scrapyd.job_status('govbuyscrapy', '0c838fd4b9f111e6abcc14dda97ae760')  # check the status of a specific job
    scrapyd.list_jobs('govbuyscrapy')  # list the project's jobs
    scrapyd.schedule('govbuyscrapy', 'govbuy_wan_shucheng')  # run the given spider in the given project
Example #3
class ScrapydJob(object):

    def __init__(self, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="website_finder", screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir

    def schedule(self, seed):

        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")

        self.job_id = self.scrapi.schedule(self.project, self.spider, seed_urls=seed, screenshot_dir=self.screenshot_dir)

        return self.job_id

    def schedule_keywords(self, phrases, use_splash=True):
        """ Schedule a Scrapyd job """
        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")

        self.job_id = self.scrapi.schedule(self.project, self.spider,
            phrases=phrases,
            screenshot_dir=self.screenshot_dir,
            use_splash=int(use_splash)
        )
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self, job_id):

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Pending"

        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None

        return "Done"
Example #4
    def post(self, request):
        """
        :param request: 启动爬虫的请求参数
        :return: 爬虫启动是否成功
        """
        data = request.data
        spider_name = data.get("spider_name")
        spider_type = data.get("spider_type")
        # print(spider_name)
        # print(spider_type)
        if spider_type == "start":
            try:
                scrapyd = ScrapydAPI('http://localhost:6800')  # 这里是去调用部署分布式爬虫
                scrapyd.schedule('default', spider_name)  # 这里是启动爬虫
            except:
                return Response("failed")
        else:
            try:
                scrapyd = ScrapydAPI('http://localhost:6800')  # 这里是去调用部署分布式爬虫
                del_dict = scrapyd.list_jobs('default')  # 这里是启动爬虫
                # print(scrapyd.list_jobs('default'))
                del_jobs = []
                for k in ["pending", "running"]:
                    # print(del_dict[k])
                    for item in del_dict[k]:
                        if item.get("spider") == spider_name:
                            del_jobs.append(item.get("id"))
                for job_id in del_jobs:
                    scrapyd.cancel('default', job_id)
                # print(del_jobs)

            except Exception:
                return Response("failed")
        return Response("ok")
Example #5
class Spider(Resource):
    def __init__(self, url):
        self.scrapyd = ScrapydAPI(url)

    def get(self):
        data = []
        for p in self.scrapyd.list_projects():
            spiders = reduce_spiders(listjobs(self.scrapyd.list_jobs(p), p))
            data.extend(spiders)
        return data

    def post(self, name):
        p, s = name.split('.')
        jobs = self.scrapyd.list_jobs(p)
        for job in (jobs['running'] + jobs['pending']):
            if job['spider'] == s:
                return 'Already Running'
        return self.scrapyd.schedule(p, s)
Example #6
def jobs_remove(project_id):
    servers = agent.servers
    project = Project.query.filter(Project.id == project_id).first()
    db.session.execute('pragma foreign_keys=on')
    for job_instance in JobInstance.query.filter_by(project_id=project_id):
        db.session.delete(job_instance)
    db.session.commit()
    for server in servers:
        scrapyd = ScrapydAPI(server)
        for job in scrapyd.list_jobs(project.project_name)['pending']:
            jobid = job['id']
            prev_status = scrapyd.cancel(project.project_name,
                                         jobid,
                                         signal='KILL')
        for job in scrapyd.list_jobs(project.project_name)['running']:
            jobid = job['id']
            prev_status = scrapyd.cancel(project.project_name,
                                         jobid,
                                         signal='KILL')
    return redirect(request.referrer, code=302)
Example #7
def job_list(request, id, project):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        result = scrapyd.list_jobs(project)
        jobs = []
        statuses = ['pending', 'running', 'finished']
        for status in statuses:
            for job in result.get(status):
                job['status'] = status
                jobs.append(job)
        return HttpResponse(json.dumps(jobs))
Example #8
def list_jobs(project, url=DEFAULT_URL):
    """
    @param project: scrapy project name
    @param url: the url on which the target scrapyd daemon listens

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.list_jobs(project)
Example #9
def list_jobs(project,
              url=DEFAULT_URL):
    """
    @param project: scrapy project name
    @param url: the url on which the target scrapyd daemon listens

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.list_jobs(project)
Example #10
class Scraper:
    def __init__(self):
        self.client = ScrapydAPI("http://scrapyd:6800", timeout=10)
        self.project = 'default'

    def schedule_spider(self, spider_name: str):
        print(f"RUN SPIDER: {spider_name}")
        return self.client.schedule(self.project, spider_name)

    def cancel_job(self, job_id: str):
        return self.client.cancel(self.project, job_id)

    def get_status_of_job(self, job_id: str):
        return self.client.job_status(self.project, job_id)

    def get_all_jobs(self):
        return self.client.list_jobs(self.project)

    def get_all_spiders(self):
        return self.client.list_spiders(self.project)
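A short usage sketch, assuming python-scrapyd-api is installed and a Scrapyd service is reachable at http://scrapyd:6800 (for example as a docker-compose service name); the spider name is a placeholder:

scraper = Scraper()
job_id = scraper.schedule_spider("quotes")  # placeholder spider name
print(scraper.get_status_of_job(job_id))    # job state as reported by Scrapyd
print(scraper.get_all_jobs())               # pending/running/finished job listing
print(scraper.get_all_spiders())            # spiders deployed in the project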
Example #11
def job_list(request, client_id, project_name):
    """
    get job list of project from one client
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: list of jobs
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            result = scrapyd.list_jobs(project_name)
            jobs = []
            statuses = ['pending', 'running', 'finished']
            for status in statuses:
                for job in result.get(status):
                    job['status'] = status
                    jobs.append(job)
            return JsonResponse(jobs, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #12
def job_list(request, node_id, project_name):
    """
    get job list of project from one node
    :param request: request object
    :param node_id: node id
    :param project_name: project name
    :return: list of jobs
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            result = scrapyd.list_jobs(project_name)
            jobs = []
            statuses = ['pending', 'running', 'finished']
            for status in statuses:
                for job in result.get(status):
                    job['status'] = status
                    jobs.append(job)
            return JsonResponse({"result": 1, "jobs": jobs})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #13
def getRunServer(deployProject='offlineCheckSpiders'):
    """
    :return: 返回pending和running状态任务数最少的机器,暂时按每个任务进行一次安排。如果超过最大任务数就不添加任务
    """
    servers = settings.SCRAPYD_URLS
    minTaskServer = None
    minTasks = -1
    for server in servers:
        try:
            scrapyd = ScrapydAPI(server, timeout=8)
            jobs = scrapyd.list_jobs(project=deployProject)
            taskNums = len(jobs.get('pending', [])) + len(
                jobs.get('running', []))
            print("server: %s Running tasks is %s" % (server, taskNums))
            if taskNums < scrapydBatchSize // 2:
                return server
            if taskNums < minTasks or minTasks < 0:
                minTaskServer = server
                minTasks = taskNums
        except BaseException as e:
            print(" %s this server is not deployed, %s" % (server, e))

    return minTaskServer
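A sketch of how the returned server might be consumed, assuming settings.SCRAPYD_URLS is configured as above; the spider name is a placeholder:

server = getRunServer('offlineCheckSpiders')
if server:
    scrapyd = ScrapydAPI(server, timeout=8)
    job_id = scrapyd.schedule('offlineCheckSpiders', 'price_check')  # placeholder spider name
    print("scheduled %s on %s" % (job_id, server))
else:
    print("no scrapyd server is available right now")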
Example #14
class ScrapydProxy(SpiderServiceProxy):
    '''
    Service class for a single scrapyd node.
    Inherits from the spider-service base class and implements its interface.
    '''
    def __init__(self, server):
        self.spider_status_name_dict = {
            SpiderStatus.PENDING: 'pending',
            SpiderStatus.RUNNING: 'running',
            SpiderStatus.FINISHED: 'finished'
        }
        super(ScrapydProxy, self).__init__(server)  # run the base-class initializer
        self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate the ScrapydAPI client

    def _scrapyd_url(self):
        return self.server  # the scrapyd URL configured for this node

    def list_projects(self):
        """
        Return the names of all projects on this scrapyd node as a list of
        strings (get_project_list returns Project objects instead).
        :return:
        """
        return self.scrapyd_api.list_projects()

    def get_project_list(self):
        """
        Get the list of all spider projects.
        :return: a list of Project objects
        """
        data = self.scrapyd_api.list_projects()  # project names on the scrapyd node
        result = []
        if data:
            for project_name in data:
                project = Project()  # wrap each name in a Project object
                project.project_name = project_name
                result.append(project)
        return result

    def delete_project(self, project_name):
        """
        Delete the given project from the scrapyd node.
        :param project_name: project name
        :return: True if the project existed and was deleted, False otherwise
        """
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def get_slave_spider_list(self, project_name):
        try:
            data = self.scrapyd_api.list_spiders(
                project_name)  # spider names in the project
            return data if data else []
        except Exception:
            return []

    def get_spider_list(self, project_name):
        """
        Get all spiders of the given project.
        :param project_name: project name
        :return: a list of SpiderInstance objects
        """
        try:
            data = self.scrapyd_api.list_spiders(
                project_name)  # spider names in the project
            result = []
            if data:
                for spider_name in data:
                    spider_instance = SpiderInstance()
                    spider_instance.spider_name = spider_name
                    result.append(spider_instance)
            return result
        except Exception:
            return []

    def get_daemon_status(self):
        pass

    def get_job_list(self, project_name, spider_status=None):
        """
        Fetch the status of every job of the given project from scrapyd.
        :param project_name: project name
        :param spider_status: job status; None (the default) returns all statuses, a specific status returns only those jobs
        :return:
        """
        result = {
            SpiderStatus.PENDING: [],
            SpiderStatus.RUNNING: [],
            SpiderStatus.FINISHED: []
        }
        try:
            data = self.scrapyd_api.list_jobs(project_name)
            if data:
                for _status in self.spider_status_name_dict.keys():
                    for item in data[self.spider_status_name_dict[_status]]:
                        start_time, end_time = None, None
                        if item.get('start_time'):
                            start_time = datetime.datetime.strptime(
                                item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
                        if item.get('end_time'):
                            end_time = datetime.datetime.strptime(
                                item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
                        result[_status].append(
                            dict(id=item['id'],
                                 start_time=start_time,
                                 end_time=end_time))
            return result if not spider_status else result[spider_status]
        except Exception:
            return result

    def start_spider(self, project_name, spider_name):
        """
        Start the given spider of the given project.
        :param project_name: project name
        :param spider_name: spider name
        :return: the job id of the started spider, or None if it could not be started
        """
        data = self.scrapyd_api.schedule(project_name,
                                         spider_name,
                                         pro_name=project_name)
        return data if data else None

    def cancel_spider(self, project_name, job_id):
        """
        Cancel the given job of the given project.
        :param project_name: project name, str
        :param job_id: job id, str
        :return: True if the job was cancelled, False otherwise
        """
        data = self.scrapyd_api.cancel(project_name, job_id)
        return data is not None

    def deploy(self, project_name, file_path):
        """
        Deploy an uploaded egg to the scrapyd node.
        :param project_name: project name, str
        :param file_path: path of the egg file, str
        :return: a string with the deployment info on success, False otherwise
        """
        version = int(time.time())
        with open(file_path, 'rb') as egg:
            spider_num = self.scrapyd_api.add_version(project_name, version, egg)
        ret = {
            'version': version,
            'project': project_name,
            'spiders': spider_num,
            'node_name': socket.gethostname(),
            'status': 'ok' if spider_num else 'error'
        }
        return str(ret) if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        """
        Build the URL of the spider's log file.
        :param project_name: project name, str
        :param spider_name: spider name, str
        :param job_id: job id, str
        :return: the URL of the log file, str
        """
        return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (
            project_name, spider_name, job_id)
Example #15
class Overseer(object):
    """
    Overseer facilitates the deployment of local spiders to a remote scrapyd server.

    Available methods:
        spawn_spiders           Create spider and deploy them to remote scrapyd server
        get_status              Report the current status of the remote scrapyd server

    """

    DEFAULT_TYPE = 'sell'
    DEFAULT_VENDOR = 'None'

    def __init__(self, name, spider_name, host, mongodb_credentials):
        self.server = ScrapydAPI(host)
        self.host_name = self._strip_host_name(host)
        self.birth_date = datetime.utcnow()
        self.name = name
        self.spider_name = spider_name
        self.alive = True
        client = pymongo.MongoClient(mongodb_credentials['server'],
                                     mongodb_credentials['port'],
                                     connectTimeoutMS=30000,
                                     socketTimeoutMS=None,
                                     socketKeepAlive=True)

        db = client[mongodb_credentials['database']]
        self.collection = db[mongodb_credentials['collection']]

    def kill(self):
        self.alive = False
        return self.host_name

    def heartbeat(self):
        return self.alive

    def spawn_spiders(self, num_spiders=5, items_per_spider=100, **kwargs):
        type = kwargs.get('type', self.DEFAULT_TYPE)
        vendor = kwargs.get('vendor', self.DEFAULT_VENDOR)

        count = 0
        while count < num_spiders:
            count += 1
            self._spawn(vendor, type, items_per_spider)
            time.sleep(3)

    def get_status(self):
        """
         Return:
             the number of running spiders
             the number of finished spiders
             the average time for one spider to finish
        """
        status = self.server.list_jobs(self.name)
        running = status['running']
        finished = status['finished']
        finished_times = [self._time_diff_in_minute(job['end_time'], job['start_time']) for job in finished]
        avg_time = np.average(finished_times)

        Notification('{} - [{}] \t Running Spiders = {}, Finished Spiders = {}, Average Runtime = {}'
                     .format(datetime.utcnow(),
                             self.host_name,
                             len(running),
                             len(finished),
                             avg_time
                             )
                     .expandtabs(3)
                     ).info()

        return len(running), len(finished), avg_time

    def _spawn(self, vendor, type, items_per_spider=100):
        # Get the tasks from the database
        tasks = self._get_tasks_from_database(vendor, type, items_per_spider)
        if not tasks:
            raise ValueError('There is no more task from the database!')

        links, property_ids = zip(*tasks)

        # Schedule the tasks with the remote scrapyd server
        job_id = self.server.schedule(self.name, self.spider_name, vendor=vendor, crawl_url=','.join(links), type=type)

        Notification('{} - [{}] \t Launch spider {}'
                     .format(datetime.utcnow(),
                             self.host_name,
                             job_id)
                     .expandtabs(3)
                     ).success()

        # Clear the tasks from the database
        self._clear_tasks_from_database(vendor, type, property_ids)

    def _get_tasks_from_database(self, vendor, type, items_per_spider):
        cursor = self.collection \
                     .find({"last_crawled_date": None, "type": type, "vendor": vendor}) \
                     .sort("created_date", pymongo.ASCENDING) \
                     .limit(items_per_spider)

        tasks = [(item['link'], item['property_id']) for item in cursor]

        return tasks

    def _clear_tasks_from_database(self, vendor, type, property_ids):
        self.collection.update({"vendor": vendor, "type": type, "property_id": {"$in": property_ids}},
                               {"$set": {"last_crawled_date": datetime.utcnow()}},
                               multi=True,
                               upsert=False)

    @staticmethod
    def _time_diff_in_minute(current, previous):
        return ((parser.parse(current) - parser.parse(previous)).seconds // 60) % 60

    @staticmethod
    def _strip_host_name(host):
        return host.replace('http://', '').replace('.compute.amazonaws.com:6800', '')
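Illustrative wiring for the class above, assuming its module-level imports (pymongo, numpy, dateutil.parser, ScrapydAPI, Notification) are available; every host, project, spider and credential value below is a placeholder:

creds = {'server': 'localhost', 'port': 27017,
         'database': 'crawl_demo', 'collection': 'tasks'}
overseer = Overseer(name='default',                # placeholder Scrapyd project name
                    spider_name='listing_spider',  # placeholder spider name
                    host='http://example.compute.amazonaws.com:6800',
                    mongodb_credentials=creds)
overseer.spawn_spiders(num_spiders=2, items_per_spider=50,
                       vendor='acme', type='sell')  # placeholder vendor/type
running, finished, avg_time = overseer.get_status()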
Example #16
class Schedular:
    def __init__(self):
        self._scrapyd = None
        try:
            self._scrapyd = ScrapydAPI('http://{}:{}'.format(
                config['Scrapyd']['host'], config['Scrapyd']['port']))
        except KeyError as e:
            logger.error("{}: No such key exists - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to create a scrapyd object - {}".format(
                class_fullname(e), str(e)))

    def addversion(self,
                   project,
                   version,
                   egg_filename='pricewatch_bot-0.0.1-py3.7.egg'):
        """ Scrapyd API: addversion - https://scrapyd.readthedocs.io/en/stable/api.html#addversion-json
        """
        if not self._scrapyd:
            logger.error(
                "No scrapyd object found. Unable to add a new version.")
            return None
        num_of_spiders = None
        try:
            with open(os.path.join(settings.APP_DIST_DIRPATH, egg_filename),
                      'rb') as egg:
                num_of_spiders = self._scrapyd.add_version(
                    project, version, egg)
        except FileNotFoundError as e:
            logger.error("{}: {}".format(class_fullname(e), str(e)))
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to add a version - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info(
                "version '{}' for project '{}' added/updated - {} spider(s)".
                format(project, version, num_of_spiders))
            # call API to create a version
            response = requests.post(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'version': version,
                    'status': settings.SCHEDULES_VERSION_STATUS_ADDED,
                    'added_at': str(datetime.now()),
                    'deleted_at': None,
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to add a version - {} - {}".format(
                        response.status_code, response.reason, response.text))
        finally:
            return num_of_spiders

    def schedule(self, project, spider, **kwargs):
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to schedule a job.")
            return None
        jobid = None
        _jobid = str(uuid.uuid4())
        kwargs['jobid'] = _jobid  # a scrapyd parameter
        kwargs['job_id'] = _jobid  # passed through to the spider
        try:
            _s = None  # scrapy settings in dict. eg {'DOWNLOAD_DELAY': 2}
            jobid = self._scrapyd.schedule(project,
                                           spider,
                                           settings=_s,
                                           **kwargs)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to schedule a job - {}".format(
                class_fullname(e), str(e)))
        else:
            if jobid != _jobid:
                logger.error(
                    "Invalid jobid [entered vs returned] [{} vs {}]".format(
                        _jobid, jobid))
            else:
                logger.info(
                    "new scheduled job '{}' for project '{}', spider '{}' has been set"
                    .format(jobid, project, spider))
                # call API to create a job
                response = requests.post(
                    'http://{}:{}/api/schedule/job/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port']),
                    json={
                        'job_id': jobid,
                        'project': project,
                        'spider': spider,
                        'version': kwargs.pop('_version', None),
                        'settings': _s,
                        'other_params': kwargs,
                        'status': settings.SCHEDULES_JOB_STATUS_PENDING,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to add a new job - {} - {}".
                        format(response.status_code, response.reason,
                               response.text))
        finally:
            return jobid

    def listjobs(self, project):
        if not self._scrapyd:
            logger.error("No scrapyd object find. Unable to list jobs.")
            return None
        jobs = None
        try:
            jobs = self._scrapyd.list_jobs(project)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to list jobs - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("list of jobs for project '{}' - {}".format(
                project, str(jobs)))
            self._store_jobs(project, jobs)
        finally:
            return jobs

    def _store_jobs(self, project, jobs):
        """ parse jobs and store information into db
        """
        if all(_j in jobs for _j in ['running', 'finished']):
            for x in jobs['running']:
                # call API to update a running job
                response = requests.put(
                    'http://{}:{}/api/schedule/job/{}/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port'], x['id']),
                    json={
                        'job_id': x['id'],
                        'project': project,
                        'spider': x['spider'],
                        'start_time': x['start_time'],
                        'status': settings.SCHEDULES_JOB_STATUS_RUNNING,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to update a running job - {} - {}"
                        .format(response.status_code, response.reason,
                                response.text))
            for x in jobs['finished']:
                # call API to update a finished job
                response = requests.put(
                    'http://{}:{}/api/schedule/job/{}/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port'], x['id']),
                    json={
                        'job_id': x['id'],
                        'project': project,
                        'spider': x['spider'],
                        'start_time': x['start_time'],
                        'end_time': x['end_time'],
                        'status': settings.SCHEDULES_JOB_STATUS_FINISHED,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to update a finished job - {} - {}"
                        .format(response.status_code, response.reason,
                                response.text))

    def delversion(self, project, version):
        """ delversion
        """
        if not self._scrapyd:
            logger.error("No scrapyd object find. Unable to delete version.")
            return False
        deleted = False
        try:
            deleted = self._scrapyd.delete_version(project, version)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to delete version - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info(
                "successfully deleted project '{}' version '{}'".format(
                    project, version))
            # update deleted version
            response = requests.put(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'version': version,
                    'status': settings.SCHEDULES_VERSION_STATUS_DELETED,
                    'deleted_at': str(datetime.now()),
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to update a deleted version - {} - {}"
                    .format(response.status_code, response.reason,
                            response.text))
        finally:
            return deleted

    def delproject(self, project):
        """ delproject
        """
        if not self._scrapyd:
            logger.error("No scrapyd object find. Unable to delete version.")
            return False
        deleted = False
        try:
            deleted = self._scrapyd.delete_project(project)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to delete project - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("successfully deleted project '{}'".format(project))
            # update deleted project
            response = requests.put(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'status': settings.SCHEDULES_VERSION_STATUS_DELETED,
                    'deleted_at': str(datetime.now()),
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to update deleted project - {} - {}"
                    .format(response.status_code, response.reason,
                            response.text))
        finally:
            return deleted

    def close(self):
        self._scrapyd.client.close()
Example #17
class ScrapydJob(object):
    def __init__(
            self,
            scrapyd_host="localhost",
            scrapyd_port="6800",
            project="default",
            spider="website_finder",
            screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir

    def schedule(self, seed):

        if not self.screenshot_dir:
            raise Exception(
                "Please set the screenshot path in the config before scheduling"
            )

        self.job_id = self.scrapi.schedule(self.project,
                                           self.spider,
                                           seed_urls=seed,
                                           screenshot_dir=self.screenshot_dir)

        return self.job_id

    def schedule_keywords(self, phrases, use_splash=True):
        """ Schedule a Scrapyd job """
        if not self.screenshot_dir:
            raise Exception(
                "Please set the screenshot path in the config before scheduling"
            )

        self.job_id = self.scrapi.schedule(self.project,
                                           self.spider,
                                           phrases=phrases,
                                           screenshot_dir=self.screenshot_dir,
                                           use_splash=int(use_splash))
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self, job_id):

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Pending"

        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None

        return "Done"
Example #18
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://127.0.0.1:6800')
# list the projects deployed to scrapyd
list_projects = scrapyd.list_projects()
print(list_projects)
# list the spiders in a project
list_spiders = scrapyd.list_spiders('wangYiStocks')
print(list_spiders)
# list the project's jobs and their ids
list_jobs = scrapyd.list_jobs('wangYiStocks')
print(list_jobs)
# cancel a job by id
cancel = scrapyd.cancel('wangYiStocks', '7c8be8661d4c11ea95d06c4b903122b5')
print(cancel)
Example #19
class ScrapydLoginFinderJob(object):
    def __init__(self,
                 seed_url,
                 username,
                 password,
                 db_name,
                 scrapyd_host="localhost",
                 scrapyd_port="6800",
                 project="default",
                 spider="login_finder"):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.seed_url = seed_url
        self.username = username
        self.password = password
        self.db_name = db_name

    def schedule(self):

        self.job_id = self.scrapi.schedule(self.project,
                                           self.spider,
                                           seed_url=self.seed_url,
                                           username=self.username,
                                           password=self.password,
                                           db_name=self.db_name)

        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self):

        if not hasattr(self, "job_id"):
            raise Exception("You must schedule a job before getting the state!")

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Pending"

        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None

        return "Done"

    def block_until_done(self, timeout=120):

        exec_time = 0
        while True:
            exec_time += 1
            if exec_time == timeout:
                raise Exception(
                    "Timeout time reached for login_finder spider execution")

            time.sleep(1)
            state = self.get_state()
            if state == "Done":
                break
Example #20
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://127.0.0.1:6800')
scrapyd.list_jobs('project_name')
Example #21
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd.list_jobs('blogSpider')
Example #22
class Scrapyd_Control(object):
    def __init__(self):
        scrapyd_url = input('Enter the scrapyd address: ')
        project = input('Enter the project name: ')
        self.project = project
        self.scrapyd = ScrapydAPI(scrapyd_url)

    # start a spider
    def schedule(self):
        spider = input('Enter the spider name: ')
        return {
            'project': self.project,
            'spider': spider,
            'jobid': self.scrapyd.schedule(self.project, spider)
        }
    
    start, run = schedule, schedule

    # cancel a spider
    def cancel(self):
        jobid = input('Paste the jobid of the spider to cancel: ')
        return self.scrapyd.cancel(self.project, jobid)

    # list projects
    def listprojects(self):
        return self.scrapyd.list_projects()

    # list spiders
    def listspiders(self):
        return self.scrapyd.list_spiders(self.project)

    # list all jobs
    def listjobs(self):
        return self.scrapyd.list_jobs(self.project)

    # check a job's status
    def jobstatus(self):
        jobid = input('Paste the jobid to check: ')
        return self.scrapyd.job_status(self.project, jobid)

    # list versions
    def listversions(self):
        return self.scrapyd.list_versions(self.project)

    # delete a version
    def delversion(self):
        version_name = input('Paste the version to delete: ')
        yes = input('Confirm deletion of version {}? Type yes to confirm, or press Enter to skip\n'.format(version_name))
        if yes == 'yes':
            return self.scrapyd.delete_version(self.project, version_name)
        else:
            pass

    # delete the project
    def delproject(self):
        yes = input('Confirm deletion of project {}? Type yes to confirm, or press Enter to skip\n'.format(self.project))
        if yes == 'yes':
            return self.scrapyd.delete_project(self.project)
        else:
            pass
        
    # list all commands
    def help(self):
        print("""
        start a spider       schedule|start|run
        cancel a spider      cancel
        list projects        listprojects
        list spiders         listspiders
        list all jobs        listjobs
        check job status     jobstatus
        list versions        listversions
        delete a version     delversion
        delete the project   delproject
        list all commands    help
        """)