def project_version(request, client_id, project_name):
    """
    get project deploy version
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: deploy version of project
    """
    if request.method == 'GET':
        # get client and project model
        client = Client.objects.get(id=client_id)
        project = Project.objects.get(name=project_name)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        # if deploy info exists in db, return it
        if Deploy.objects.filter(client=client, project=project):
            deploy = Deploy.objects.get(client=client, project=project)
        # if deploy info does not exist in db, create deploy info
        else:
            try:
                versions = scrapyd.list_versions(project_name)
            except ConnectionError:
                return JsonResponse({'message': 'Connect Error'}, status=500)
            if len(versions) > 0:
                version = versions[-1]
                deployed_at = timezone.datetime.fromtimestamp(int(version), tz=pytz.timezone(TIME_ZONE))
            else:
                deployed_at = None
            deploy, result = Deploy.objects.update_or_create(client=client, project=project, deployed_at=deployed_at)
        # return deploy json info
        return JsonResponse(model_to_dict(deploy))
def project_deploy(request, client_id, project_name):
    """
    deploy project operation
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: json of deploy result
    """
    if request.method == 'POST':
        # get project folder
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # find egg file
        egg = find_egg(project_path)
        egg_file = open(join(project_path, egg), 'rb')
        # get client and project model
        client = Client.objects.get(id=client_id)
        project = Project.objects.get(name=project_name)
        # execute deploy operation
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            scrapyd.add_version(project_name, int(time.time()), egg_file.read())
            # update deploy info
            deployed_at = timezone.now()
            Deploy.objects.filter(client=client, project=project).delete()
            deploy, result = Deploy.objects.update_or_create(client=client, project=project, deployed_at=deployed_at,
                                                             description=project.description)
            return JsonResponse(model_to_dict(deploy))
        except Exception:
            return JsonResponse({'message': get_traceback()}, status=500)
def twitterscraping(request):
    context = {}
    form = TweetForm()
    context['topic'] = ""
    context['max_tweets'] = 15
    context['action'] = reverse('twitterscraping')
    context['firtsTimeLoad'] = 1
    if request.method == 'POST':
        topic = request.POST.get('topic')
        max_tweet = request.POST.get('max_tweets')
        context['topic'] = topic
        context['max_tweets'] = int(max_tweet)
        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()
        _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey, KeyWord=topic)
        _UserCrawlHistory.save(using="SentimentAppDB")
        global SpiderTwitterJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderTwitterJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME, 'SpiderTwitter',
                                              historyKey=historyKey, count=max_tweet)
        context['firtsTimeLoad'] = 0
        context['historyKey'] = historyKey
    return render(request, 'SentimentApp/twitterscraping.html', {
        'result': context,
        'form': form
    })
class ScrapydLoginFinderJob(object):

    def __init__(self, seed_url, username, password, db_name,
                 scrapyd_host="localhost", scrapyd_port="6800",
                 project="default", spider="login_finder"):
        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.seed_url = seed_url
        self.username = username
        self.password = password
        self.db_name = db_name

    def schedule(self):
        self.job_id = self.scrapi.schedule(self.project, self.spider,
                                           seed_url=self.seed_url,
                                           username=self.username,
                                           password=self.password,
                                           db_name=self.db_name)
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self):
        try:
            self.job_id
        except AttributeError:
            raise Exception("You must schedule a job before getting the state!")
        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Running"
            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Pending"
        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None
        return "Done"

    def block_until_done(self, timeout=120):
        exec_time = 0
        while True:
            exec_time += 1
            if exec_time == timeout:
                raise Exception("Timeout time reached for login_finder spider execution")
            time.sleep(1)
            state = self.get_state()
            if state == "Done":
                break
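A minimal usage sketch for the ScrapydLoginFinderJob class above, assuming a Scrapyd instance is reachable on localhost:6800 with the login_finder spider deployed; the credentials and database name below are placeholder values:

# placeholder values for illustration only
job = ScrapydLoginFinderJob(
    seed_url="http://example.com/login",
    username="demo_user",
    password="demo_pass",
    db_name="crawl_results",
)
job_id = job.schedule()            # returns the Scrapyd job id
job.block_until_done(timeout=300)  # polls once per second until the job leaves running/pending
print(job.get_state())             # "Done" once the job has finished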
def _start_spider(domain, keywords=None):
    localhost = 'http://localhost:6800'
    scrapyd = ScrapydAPI(localhost)
    job_id = scrapyd.schedule('default', 'externalspider', started_by_domain=domain, keywords=keywords)
    return job_id
def remove_all_version(request, project, client_id):
    node = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
    try:
        result = scrapyd.delete_project(project)
        return JsonResponse(result, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
def __init__(self, server):
    self.spider_status_name_dict = {
        SpiderStatus.PENDING: 'pending',
        SpiderStatus.RUNNING: 'running',
        SpiderStatus.FINISHED: 'finished'
    }
    super(ScrapydProxy, self).__init__(server)  # call the parent class's __init__
    self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate the ScrapydAPI client
def get_scrapyd(client):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        result = scrapyd.list_projects()  # probe the node; raises if it is unreachable
        return scrapyd
    except (ConnectionError, InvalidURL):
        return False
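A short sketch of calling the helper above; any object exposing ip and port attributes will do, so the namedtuple here is just a stand-in (an assumption) for the real client model:

from collections import namedtuple

FakeClient = namedtuple('FakeClient', ['ip', 'port'])  # stand-in for the Django client model
scrapyd = get_scrapyd(FakeClient(ip='127.0.0.1', port=6800))
if scrapyd:
    print(scrapyd.list_projects())
else:
    print('scrapyd node unreachable')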
def delete_version(project, client, version):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        result = scrapyd.delete_version(project.name, version)
        return True if result else False
    except (ConnectionError, InvalidURL):
        return False
def task():
    node = Node.objects.get(id=node_id)
    scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
    try:
        job = scrapyd.schedule(project_name, spider_name)
        return JsonResponse({'job': job})
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
def remove_depody_spider(request, client_id, project, version_name):
    if request.method == 'POST':
        node = Node.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            result = scrapyd.delete_version(project, version_name)
            return JsonResponse(result, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def get_spider_version(request, project, client_id):
    client = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        spiders = scrapyd.list_spiders(project)
        spiders = [{'name': spider, 'id': index + 1} for index, spider in enumerate(spiders)]
        return JsonResponse(spiders, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
def debug():
    scrapyd = ScrapydAPI("http://localhost:6800")
    job = scrapyd.schedule(project='sougouHotList', spider='sougouHotListSpider')
    # job = scrapyd.schedule(project='baiduHotList', spider='baiduHotListSpider')
    time.sleep(120)
    print('Stopping the spider...')
    scrapyd.cancel(project='sougouHotList', job=job)
    time.sleep(300)
    print('Spider stopped')
def task(request, project_name, spider_names, client_id):
    client = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        jobs = []
        for spider_name in spider_names:
            jobs.append(scrapyd.schedule(project_name, spider_name))
        return JsonResponse(jobs, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
def client_status(request, id):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            scrapyd.list_projects()
            return HttpResponse('1')
        except:
            return HttpResponse('0')
def get_scrapyd(client):
    """
    get scrapyd of client
    :param client: client
    :return: scrapyd
    """
    if not client.auth:
        return ScrapydAPI(scrapyd_url(client.ip, client.port))
    return ScrapydAPI(scrapyd_url(client.ip, client.port), auth=(client.username, client.password))
def version(client, project):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        versions = scrapyd.list_versions(project)
        if len(versions) > 0:
            return versions[-1]
        return ''
    except (ConnectionError, InvalidURL, UnicodeError):
        return ''
def spider_list(request, id, project):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        spiders = scrapyd.list_spiders(project)
        spiders = [{
            'name': spider,
            'id': index + 1
        } for index, spider in enumerate(spiders)]
        return HttpResponse(json.dumps(spiders))
def get_project_version(request, project, client_id):
    if request.method == 'GET':
        client = Node.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            versions = scrapyd.list_versions(project)
            versions = [{'name': version, 'id': index + 1} for index, version in enumerate(versions)]
            return JsonResponse(versions, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def post(self, request):
    """
    :param request: request parameters for starting the spider
    :return: whether the spider started successfully
    """
    data = request.data
    spider_name = data.get("spider_name")
    spider_type = data.get("spider_type")
    # print(spider_name)
    # print(spider_type)
    if spider_type == "start":
        try:
            scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed distributed crawler
            scrapyd.schedule('default', spider_name)  # start the spider
        except:
            return Response("failed")
    else:
        try:
            scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed distributed crawler
            del_dict = scrapyd.list_jobs('default')  # list the current jobs
            # print(scrapyd.list_jobs('default'))
            del_jobs = []
            for k in ["pending", "running"]:
                # print(del_dict[k])
                for item in del_dict[k]:
                    if item.get("spider") == spider_name:
                        del_jobs.append(item.get("id"))
            for job_id in del_jobs:
                scrapyd.cancel('default', job_id)
            # print(del_jobs)
        except:
            return Response("failed")
    return Response("ok")
def __init__(self):
    self._scrapyd = None
    try:
        self._scrapyd = ScrapydAPI('http://{}:{}'.format(
            config['Scrapyd']['host'], config['Scrapyd']['port']))
    except KeyError as e:
        logger.error("{}: No such key exists - {}".format(
            class_fullname(e), str(e)))
    except Exception as e:
        logger.error("{}: Failed to create a scrapyd object - {}".format(
            class_fullname(e), str(e)))
def delete_project(project, url=DEFAULT_URL):
    """
    @param project: scrapy project name
    @param url: the url which target scrapyd daemon listens on

    Delete the project (and all of its uploaded versions) from the target scrapyd daemon.
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.delete_project(project)
def schedule_job(project, spider, url=DEFAULT_URL, settings={}, **kwargs):
    """
    @param project: scrapy project name
    @param spider: spider name
    @param url: the url which target scrapyd daemon listens on
    @param settings: the settings dictionary

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.schedule(project, spider, settings, **kwargs)
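A hedged example of calling the helper above; the project and spider names are placeholders, and the settings dict is forwarded to Scrapyd as per-job setting overrides:

job_id = schedule_job(
    'myproject',                     # placeholder project name
    'spider2',                       # placeholder spider name
    settings={'DOWNLOAD_DELAY': 2},  # per-job Scrapy setting override
    category='books',                # extra kwarg passed through as a spider argument
)
print(job_id)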
def execute(*args):
    args = list(args)
    assert len(args) >= 1, u'At least the spider name must be provided'
    spider_name = args.pop(0)
    scrapy_url = 'http://localhost:6800'
    if args:
        scrapy_url = args.pop(0)
    scrapyd = ScrapydAPI(scrapy_url)
    scrapyd.schedule(
        get_project_name(),
        spider_name)
class ScrapydJob(object):

    def __init__(self, scrapyd_host="localhost", scrapyd_port="6800",
                 project="default", spider="website_finder",
                 screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):
        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir

    def schedule(self, seed):
        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")
        self.job_id = self.scrapi.schedule(self.project, self.spider,
                                           seed_urls=seed,
                                           screenshot_dir=self.screenshot_dir)
        return self.job_id

    def schedule_keywords(self, phrases, use_splash=True):
        """ Schedule a Scrapyd job """
        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")
        self.job_id = self.scrapi.schedule(self.project, self.spider,
                                           phrases=phrases,
                                           screenshot_dir=self.screenshot_dir,
                                           use_splash=int(use_splash))
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self, job_id):
        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Running"
            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Pending"
        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None
        return "Done"
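A usage sketch for ScrapydJob, assuming a local Scrapyd instance with the website_finder spider deployed and a writable screenshot directory (the path is a placeholder):

job = ScrapydJob(scrapyd_host="localhost", scrapyd_port="6800",
                 project="default", spider="website_finder",
                 screenshot_dir="/tmp/screenshots")  # placeholder path
job_id = job.schedule("http://example.com")          # seed passed to the spider as seed_urls
print(job.get_state(job_id))                         # "Pending", "Running", "Done", or None on error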
def job_list(request, id, project):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        result = scrapyd.list_jobs(project)
        jobs = []
        statuses = ['pending', 'running', 'finished']
        for status in statuses:
            for job in result.get(status):
                job['status'] = status
                jobs.append(job)
        return HttpResponse(json.dumps(jobs))
def videoGetDetailsTaskSchedule():
    platformInfo = {
        '哔哩哔哩视频': 'bilibiliDetailInfo',
        '西瓜视频': 'xiguaDetailedInfo',  # the earlier spider took the better name; this one crawls the more complete Xigua data
        '今日头条': 'xiguaDetailedInfo',
        '今日头条_点赞数': 'xiguaDetailInfo',  # only supplements the like count
    }
    batchCheckNums = 64
    extraParams = {
        'proxytype': '1',
    }
    extraParams = json.dumps(extraParams, ensure_ascii=False, separators=(',', ':'))
    for k, v in platformInfo.items():
        if k == '哔哩哔哩视频' or k == '西瓜视频' or k == '今日头条':
            records = VideoDetailsData.objects.filter(
                platform__exact=k).filter(status__exact=2)
        elif k == '今日头条_点赞数':
            records = MovieOfflineData.objects.filter(
                platform__exact='今日头条').filter(ishz__exact=1).filter(
                detailStatus__exact=0).filter(tag__in=['待处理', '未下线'])
        else:
            records = []
        spider = Spider.objects.get(Q(name__exact=v), Q(status__exact=0))
        deployProject = spider.deployProject
        i = 0
        scheduleServer = None
        j = m = 1
        paramList = []
        for record in records:
            if i % scrapydBatchSize == 0 and (j - 1) % batchCheckNums == 0:
                scheduleServer = getRunServer(deployProject)
            if scheduleServer:
                scrapyd = ScrapydAPI(scheduleServer, timeout=8)
                paramList.append({'id': record.id, 'targetUrl': record.url})
                if j % batchCheckNums == 0 or m == len(records):
                    params = json.dumps(paramList, ensure_ascii=False, separators=(',', ':'))
                    print(params)
                    status = scrapyd.schedule(project=deployProject, spider=spider.name,
                                              idTargetUrlList=params, extraParams=extraParams)
                    print(status)
                    paramList = []
                i += 1
                j += 1
                m += 1
def webcrawling(request):
    context = {}
    form = WebCrawlForm()
    context['formUrlList'] = "http://www.newspapers71.com/\n\
http://www.ntvbd.com/\r\n\
http://www.prothom-alo.com/\r\n\
http://www.kalerkantho.com/\r\n\
http://www.bhorerkagoj.net/\r\n\
http://www.jaijaidinbd.com/\r\n\
http://www.amadershomoy.biz/beta/\r\n\
https://www.dailyinqilab.com/\r\n\
http://www.jugantor.com/\r\n\
http://www.dailynayadiganta.com/\r\n\
http://www.mzamin.com/"
    context['formKeyWordList'] = "এসিআই\r\n\
স্বপ্ন\r\n\
স্যাভলন"
    context['action'] = reverse('webcrawling')
    context['firtsTimeLoad'] = 1
    if request.method == 'POST':
        urlText = request.POST.get('url')
        context['formUrlList'] = urlText
        keyWordList = request.POST.get('keyWord')
        context['formKeyWordList'] = keyWordList
        keyWord = keyWordList.splitlines()
        depth = request.POST.get('depth')
        context['depth'] = int(depth)
        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()
        for key in keyWord:
            _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey, KeyWord=key)
            _UserCrawlHistory.save(using='SentimentAppDB')
        global SpiderWebCrawlerJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderWebCrawlerJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME, 'SpiderWebCrawler',
                                                 urls=urlText, depth=depth, historyKey=historyKey)
        context['historyKey'] = historyKey
        context['firtsTimeLoad'] = 0
    return render(request, 'SentimentApp/webcrawling.html', {
        'result': context,
        'form': form
    })
def __init__(self,
             scrapyd_host="localhost",
             scrapyd_port="6800",
             project="default",
             spider="website_finder",
             screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):
    scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
    self.scrapi = ScrapydAPI(scrapy_url)
    self.project = project
    self.spider = spider
    self.screenshot_dir = screenshot_dir
def check_parser_status():
    """
    task that checks the current status (running, scheduled, finished, or an empty
    string if no scraper with the selected job id is found) of all scrapers in the database
    :return:
    """
    spiders = Scraper.objects.all()
    scrapyd = ScrapydAPI('http://scrapyd:6800')
    for spider in spiders:
        status = scrapyd.job_status(BOT_NAME, spider.job_id)
        spider.status = status
        spider.save(update_fields=['status'])
def AJX_BrowserCloseEvent(request):
    print("Browser close or reload event detected.")
    source = request.GET.get('source', None)
    global SpiderWebCrawlerJOBID
    global SpiderFacebookJOBID
    scrapyd = ScrapydAPI('http://127.0.0.1:6800')
    if source == "WebCrawl":
        try:
            print("Trying to stop web crawler scrapyd job : " + str(SpiderWebCrawlerJOBID))
            scrapyd.cancel(SCRAPYD_PROJECT_NAME, SpiderWebCrawlerJOBID)
        except:
            print("Can't find an active web crawler job.")
    if source == "Facebook":
        try:
            print("Trying to stop facebook scrapyd job : " + str(SpiderFacebookJOBID))
            scrapyd.cancel(SCRAPYD_PROJECT_NAME, SpiderFacebookJOBID)
        except:
            print("Can't find an active Facebook scraping job.")
    if source == "Twitter":
        try:
            print("Trying to stop twitter scrapyd job : " + str(SpiderTwitterJOBID))
            scrapyd.cancel(SCRAPYD_PROJECT_NAME, SpiderTwitterJOBID)
        except:
            print("Can't find an active Twitter scraping job.")
    return JsonResponse(None, safe=False)
def commonSchedule(type, catagery, isChangeScheduleStatus):
    if type == 0:
        if catagery == 1:
            results = MovieCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MovieCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)
    elif type == 1:
        if catagery == 1:
            results = MusicCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MusicCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)
    results = results[:(len(settings.SCRAPYD_URLS) * scrapydBatchSize)]
    i = 0
    scheduleServer = None
    for item in results:
        try:
            dictParam = json.loads(item.json) if item.json else {}
        except BaseException as e:
            print("Invalid JSON data passed in!")
            dictParam = {}
        searchWord, searchTaskId, suffixWords, spiderList, extraParams = setDeParams(dictParam)
        extraParams = json.dumps(extraParams, ensure_ascii=False, separators=(',', ':'))
        if i % scrapydBatchSize == 0:
            scheduleServer = getRunServer()
        if scheduleServer:
            if isChangeScheduleStatus:
                item.manage = 1
            scrapyd = ScrapydAPI(scheduleServer, timeout=8)
            if len(searchWord):
                item.startNum = len(spiderList)
                for spider in spiderList:
                    print(spider.deployProject, spider.name, searchWord, searchTaskId, suffixWords, extraParams)
                    project = spider.deployProject
                    scrapyd.schedule(project=project, spider=spider.name, keyword=searchWord,
                                     searchTaskId=searchTaskId, suffixWords=suffixWords,
                                     extraParams=extraParams)
            item.save()
            i += 1
def call_scrapyd_service():
    """Operate spiders through the API.

    Reference docs: https://pypi.python.org/pypi/python-scrapyd-api#downloads
    """
    scrapyd = ScrapydAPI('http://localhost:6800')
    scrapyd.job_status('govbuyscrapy', '0c838fd4b9f111e6abcc14dda97ae760')  # check the status of a specific crawl job
    scrapyd.list_jobs('govbuyscrapy')  # list the project's crawl jobs
    scrapyd.schedule('govbuyscrapy', 'govbuy_wan_shucheng')  # run the given spider in the given project
def project_list(request, client_id):
    """
    project deployed list on one client
    :param request: request object
    :param client_id: client id
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            projects = scrapyd.list_projects()
            return JsonResponse(projects, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def __init__(self, scrapyd_host="localhost", scrapyd_port="6800",
             project="default", spider="website_finder",
             screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):
    scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
    self.scrapi = ScrapydAPI(scrapy_url)
    self.project = project
    self.spider = spider
    self.screenshot_dir = screenshot_dir
def __init__(self, scrapyd_host="localhost", scrapyd_port="6800",
             project="default", spider="website_finder",
             screenshot_dir=""):
    scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
    self.scrapi = ScrapydAPI(scrapy_url)
    self.project = project
    self.spider = spider
    self.screenshot_dir = screenshot_dir
def spider_list(request, client_id, project_name):
    """
    get spider list from one client
    :param request: request Object
    :param client_id: client id
    :param project_name: project name
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            spiders = scrapyd.list_spiders(project_name)
            spiders = [{'name': spider, 'id': index + 1} for index, spider in enumerate(spiders)]
            return JsonResponse(spiders, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def job_cancel(request, client_id, project_name, job_id):
    """
    cancel a job
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :param job_id: job id
    :return: json of cancel
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        try:
            scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
            result = scrapyd.cancel(project_name, job_id)
            return JsonResponse(result, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'})
def spider_start(request, client_id, project_name, spider_name):
    """
    start a spider
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :param spider_name: spider name
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            job = scrapyd.schedule(project_name, spider_name)
            return JsonResponse({'job': job})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def __init__(self, seed_url, username, password, db_name,
             scrapyd_host="localhost", scrapyd_port="6800",
             project="default", spider="login_finder"):
    scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
    self.scrapi = ScrapydAPI(scrapy_url)
    self.project = project
    self.spider = spider
    self.seed_url = seed_url
    self.username = username
    self.password = password
    self.db_name = db_name
def job_list(request, client_id, project_name):
    """
    get job list of project from one client
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: list of jobs
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            result = scrapyd.list_jobs(project_name)
            jobs = []
            statuses = ['pending', 'running', 'finished']
            for status in statuses:
                for job in result.get(status):
                    job['status'] = status
                    jobs.append(job)
            return JsonResponse(jobs, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def scheduler_job():
    """
    Check the scheduled tasks once per minute.
    :return:
    """
    models = Task.objects.all()
    for model in models:
        scheduler_at = model.scheduler_at
        updated_at = model.updated_at
        scheduler_at_time_stamp = scheduler_at * 60
        updated_at_time_stamp = time.mktime(updated_at.timetuple())
        if time.time() - updated_at_time_stamp > scheduler_at_time_stamp:
            client_id = model.client_id
            project_name = model.project_name
            spider_name = model.spider_name
            client = Client.objects.get(id=client_id)
            scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
            try:
                job = scrapyd.schedule(project_name, spider_name)
                model.success = 1
            except ConnectionError:
                model.success = 0
            finally:
                model.save()
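scheduler_job above is intended to run once a minute; in production that would usually be a cron entry or a Celery beat task, but a naive driver loop (an assumption, not part of the original code) could look like:

import time

def run_scheduler_forever(interval_seconds=60):
    # naive periodic driver for scheduler_job(); replace with cron or Celery beat in production
    while True:
        scheduler_job()
        time.sleep(interval_seconds)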
def __init__(self, name, spider_name, host, mongodb_credentials):
    self.server = ScrapydAPI(host)
    self.host_name = self._strip_host_name(host)
    self.birth_date = datetime.utcnow()
    self.name = name
    self.spider_name = spider_name
    self.alive = True
    client = pymongo.MongoClient(mongodb_credentials['server'],
                                 mongodb_credentials['port'],
                                 connectTimeoutMS=30000,
                                 socketTimeoutMS=None,
                                 socketKeepAlive=True)
    db = client[mongodb_credentials['database']]
    self.collection = db[mongodb_credentials['collection']]
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://127.0.0.1:6800')
scrapyd.list_jobs('project_name')
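list_jobs returns a dict keyed by job state (mirroring Scrapyd's listjobs.json response); a small sketch of walking it:

jobs = scrapyd.list_jobs('project_name')
for state in ('pending', 'running', 'finished'):
    for job in jobs.get(state, []):
        print(state, job['id'], job.get('spider'))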
class Overseer(object):
    """
    Overseer facilitates the deployment process of local spiders to a remote scrapyd server

    Available methods:
        spawn_spiders   Create spiders and deploy them to the remote scrapyd server
        get_status      Report the current status of the remote scrapyd server
    """
    DEFAULT_TYPE = 'sell'
    DEFAULT_VENDOR = 'None'

    def __init__(self, name, spider_name, host, mongodb_credentials):
        self.server = ScrapydAPI(host)
        self.host_name = self._strip_host_name(host)
        self.birth_date = datetime.utcnow()
        self.name = name
        self.spider_name = spider_name
        self.alive = True
        client = pymongo.MongoClient(mongodb_credentials['server'],
                                     mongodb_credentials['port'],
                                     connectTimeoutMS=30000,
                                     socketTimeoutMS=None,
                                     socketKeepAlive=True)
        db = client[mongodb_credentials['database']]
        self.collection = db[mongodb_credentials['collection']]

    def kill(self):
        self.alive = False
        return self.host_name

    def heartbeat(self):
        return self.alive

    def spawn_spiders(self, num_spiders=5, items_per_spider=100, **kwargs):
        type = kwargs.get('type', self.DEFAULT_TYPE)
        vendor = kwargs.get('vendor', self.DEFAULT_VENDOR)
        count = 0
        while count < num_spiders:
            count += 1
            self._spawn(vendor, type, items_per_spider)
            time.sleep(3)

    def get_status(self):
        """
        Return:
            the number of running spiders
            the number of finished spiders
            the average time for one spider to finish
        """
        status = self.server.list_jobs(self.name)
        running = status['running']
        finished = status['finished']
        finished_times = [self._time_diff_in_minute(job['end_time'], job['start_time'])
                          for job in finished]
        avg_time = np.average(finished_times)
        Notification('{} - [{}] \t Running Spiders = {}, Finished Spiders = {}, Average Runtime = {}'
                     .format(datetime.utcnow(), self.host_name, len(running), len(finished), avg_time)
                     .expandtabs(3)
                     ).info()
        return len(running), len(finished), avg_time

    def _spawn(self, vendor, type, items_per_spider=100):
        # Get the tasks from the database
        tasks = self._get_tasks_from_database(vendor, type, items_per_spider)
        if not tasks:
            raise ValueError('There is no more task from the database!')
        links, property_ids = zip(*tasks)
        # Schedule the tasks with the remote scrapyd server
        job_id = self.server.schedule(self.name, self.spider_name,
                                      vendor=vendor, crawl_url=','.join(links), type=type)
        Notification('{} - [{}] \t Launch spider {}'
                     .format(datetime.utcnow(), self.host_name, job_id)
                     .expandtabs(3)
                     ).success()
        # Clear the tasks from the database
        self._clear_tasks_from_database(vendor, type, property_ids)

    def _get_tasks_from_database(self, vendor, type, items_per_spider):
        cursor = self.collection \
            .find({"last_crawled_date": None, "type": type, "vendor": vendor}) \
            .sort("created_date", pymongo.ASCENDING) \
            .limit(items_per_spider)
        tasks = [(item['link'], item['property_id']) for item in cursor]
        return tasks

    def _clear_tasks_from_database(self, vendor, type, property_ids):
        self.collection.update({"vendor": vendor, "type": type, "property_id": {"$in": property_ids}},
                               {"$set": {"last_crawled_date": datetime.utcnow()}},
                               multi=True, upsert=False)

    @staticmethod
    def _time_diff_in_minute(current, previous):
        return ((parser.parse(current) - parser.parse(previous)).seconds // 60) % 60

    @staticmethod
    def _strip_host_name(host):
        return host.replace('http://', '').replace('.compute.amazonaws.com:6800', '')
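A hedged instantiation sketch for Overseer; the MongoDB credentials dict mirrors the keys the constructor reads, and every value below is a placeholder:

mongodb_credentials = {
    'server': 'localhost',       # placeholder MongoDB host
    'port': 27017,
    'database': 'crawl_tasks',   # placeholder database name
    'collection': 'properties',  # placeholder collection name
}
overseer = Overseer(name='default', spider_name='property_spider',
                    host='http://localhost:6800',
                    mongodb_credentials=mongodb_credentials)
overseer.spawn_spiders(num_spiders=2, items_per_spider=50,
                       vendor='example_vendor', type='sell')
running, finished, avg_time = overseer.get_status()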
def job_status(project, job, url=DEFAULT_URL):
    scrapyd = ScrapydAPI(url)
    return scrapyd.job_status(project, job)
def delete_version(project, version, url=DEFAULT_URL):
    scrapyd = ScrapydAPI(url)
    return scrapyd.delete_version(project, version)