def ScrapydData(request):
    """Schedule one spider run on the local scrapyd daemon and print the projects it knows."""
    from scrapyd_api import ScrapydAPI

    client = ScrapydAPI('http://localhost:6800')
    # Run one spider, throttled with a per-job download delay.
    crawl_settings = {'DOWNLOAD_DELAY': 2}
    client.schedule('project_name', 'spider_name', settings=crawl_settings)
    print(client.list_projects())
def url_check(request):
    """Return an existing product id for the POSTed URL, or start a crawl for it.

    POST param ``input``: the product URL to look up.
    Returns ``{'pid': ..., 'status': True}`` when the URL is already known,
    otherwise schedules the ``fab_crawler`` spider and returns the crawl's
    unique id with ``status`` False.
    """
    url = request.POST.get('input')
    # Bug fix: the original called .strip() on the raw .get() result, which
    # raised AttributeError whenever the 'input' field was missing (None).
    if url is None:
        return JsonResponse({'status': False, 'error': 'missing input'},
                            status=status.HTTP_400_BAD_REQUEST)
    url = url.strip()
    try:
        product_id = ProductId.objects.get(url=url)
        return JsonResponse({
            'pid': product_id.pid,
            'status': True
        }, status=status.HTTP_200_OK)
    except ProductId.DoesNotExist:
        domain = urlparse(url).netloc
        unique_id = str(uuid4())
        settings = {
            'unique_id': unique_id,
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }
        scrapyd = ScrapydAPI('http://localhost:6800')
        scrapyd.schedule('default', 'fab_crawler', settings=settings,
                         url=url, domain=domain, uuid=unique_id)
        return JsonResponse({
            'status': False,
            'uuid': unique_id
        }, status=status.HTTP_200_OK)
def inspection_new(request: HttpRequest):
    """Validate the submitted URL, persist an Extraction, and schedule its crawl."""
    # TO-DO Use Django's form validation
    form_url = "scraping_url"
    error_target = reverse("url_inspector:error_new_inspection")

    # Guard clause: the form field must be present.
    if form_url not in request.GET:
        return HttpResponseRedirect(error_target)

    url = request.GET[form_url]
    try:
        URLValidator()(url)
    except ValidationError:
        return HttpResponseRedirect(error_target)

    extraction = Extraction(url=url)
    extraction.save()

    scrapyc = ScrapydAPI(conf_scrapyd["target"])
    try:
        scrapyc.schedule(conf_scrapyd["project"],
                         conf_scrapyd["spider"],
                         urls=url,
                         django_urls=(url, ),
                         django_ids=(extraction.id, ))
    except Exception:
        # Scheduling failed (scrapyd down, bad config, ...); show the error page.
        return HttpResponseRedirect(error_target)

    return HttpResponseRedirect(
        reverse("url_inspector:inspection", kwargs={"pk": extraction.id}))
def post(self, request):
    """
    Start or stop a spider on the local scrapyd daemon.

    :param request: request whose data carries ``spider_name`` and
        ``spider_type`` ("start" launches the spider; any other value
        cancels every pending/running job of that spider).
    :return: Response("ok") on success, Response("failed") on error.
    """
    data = request.data
    spider_name = data.get("spider_name")
    spider_type = data.get("spider_type")
    if spider_type == "start":
        try:
            scrapyd = ScrapydAPI('http://localhost:6800')
            scrapyd.schedule('default', spider_name)  # launch the spider
        # Bug fix: the original bare ``except:`` also swallowed SystemExit
        # and KeyboardInterrupt; narrowed to Exception.
        except Exception:
            return Response("failed")
    else:
        try:
            scrapyd = ScrapydAPI('http://localhost:6800')
            del_dict = scrapyd.list_jobs('default')
            # Collect the ids of every pending/running job of this spider...
            del_jobs = []
            for k in ["pending", "running"]:
                for item in del_dict[k]:
                    if item.get("spider") == spider_name:
                        del_jobs.append(item.get("id"))
            # ...and cancel them one by one.
            for job_id in del_jobs:
                scrapyd.cancel('default', job_id)
        except Exception:
            return Response("failed")
    return Response("ok")
def call_scrapyd_service():
    """Drive spiders through the scrapyd HTTP API.

    Reference: https://pypi.python.org/pypi/python-scrapyd-api#downloads
    """
    api = ScrapydAPI('http://localhost:6800')
    # Status of one specific crawl job.
    api.job_status('govbuyscrapy', '0c838fd4b9f111e6abcc14dda97ae760')
    # Every job of the project.
    api.list_jobs('govbuyscrapy')
    # Launch the named spider inside the project.
    api.schedule('govbuyscrapy', 'govbuy_wan_shucheng')
def execute(*args):
    """Schedule the spider named in args[0]; args[1] may override the scrapyd URL."""
    args = list(args)
    assert len(args) >= 1, u'É necessário informar pelo menos a spider'
    spider_name = args.pop(0)
    # Fall back to the local daemon unless an explicit URL was supplied.
    scrapy_url = args.pop(0) if args else 'http://localhost:6800'
    ScrapydAPI(scrapy_url).schedule(get_project_name(), spider_name)
class ScrapydJob(object): def __init__(self, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="website_finder", screenshot_dir='/memex-pinterest/ui/static/images/screenshots'): scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port) self.scrapi = ScrapydAPI(scrapy_url) self.project = project self.spider = spider self.screenshot_dir = screenshot_dir def schedule(self, seed): if not self.screenshot_dir: raise Exception("Please set the screenshot path in the config before scheduling") self.job_id = self.scrapi.schedule(self.project, self.spider, seed_urls=seed, screenshot_dir=self.screenshot_dir) return self.job_id def schedule_keywords(self, phrases, use_splash=True): """ Schedule a Scrapyd job """ if not self.screenshot_dir: raise Exception("Please set the screenshot path in the config before scheduling") self.job_id = self.scrapi.schedule(self.project, self.spider, phrases=phrases, screenshot_dir=self.screenshot_dir, use_splash=int(use_splash) ) return self.job_id def list_jobs(self): return self.scrapi.list_jobs(self.project) def get_state(self, job_id): try: for job in self.scrapi.list_jobs(self.project)["running"]: print job_id, job["id"] if job["id"] == job_id: return "Running" for job in self.scrapi.list_jobs(self.project)["pending"]: print job_id, job["id"] if job["id"] == job_id: return "Pending" except Exception: print "handled exception:" traceback.print_exc() return None return "Done"
def commonSchedule(type, catagery, isChangeScheduleStatus):
    # Dispatch crawl tasks of one category across the scrapyd server pool.
    # type: 0 selects movie tasks, 1 selects music tasks; catagery picks the
    # task kind (1 means "take all", otherwise only unmanaged rows).
    # isChangeScheduleStatus: when truthy, mark each dispatched row manage=1.
    if type == 0:
        if catagery == 1:
            results = MovieCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MovieCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)
    elif type == 1:
        if catagery == 1:
            results = MusicCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MusicCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)
    # Cap the batch: one scrapydBatchSize slice per configured server.
    results = results[:(len(settings.SCRAPYD_URLS) * scrapydBatchSize)]
    i = 0
    scheduleServer = None
    for item in results:
        try:
            # Each row may carry extra spider parameters as a JSON payload.
            dictParam = json.loads(item.json) if item.json else {}
        except BaseException as e:
            print("json传入非法数据!")
            dictParam = {}
        searchWord, searchTaskId, suffixWords, spiderList, extraParams = setDeParams(
            dictParam)
        extraParams = json.dumps(extraParams, ensure_ascii=False,
                                 separators=(',', ':'))
        # Rotate to a fresh server at the start of every batch.
        if i % scrapydBatchSize == 0:
            scheduleServer = getRunServer()
        if scheduleServer:
            if isChangeScheduleStatus:
                item.manage = 1
            scrapyd = ScrapydAPI(scheduleServer, timeout=8)
            if len(searchWord):
                item.startNum = len(spiderList)
                for spider in spiderList:
                    print(spider.deployProject, spider.name, searchWord,
                          searchTaskId, suffixWords, extraParams)
                    project = spider.deployProject
                    scrapyd.schedule(project=project, spider=spider.name,
                                     keyword=searchWord,
                                     searchTaskId=searchTaskId,
                                     suffixWords=suffixWords,
                                     extraParams=extraParams)
            # NOTE(review): indentation reconstructed from collapsed source;
            # item.save() is assumed to sit inside the scheduleServer branch
            # (it persists manage/startNum) — confirm against the original.
            item.save()
        i += 1
def post(self, request):
    """Kick off all four scrapers and return their scrapyd job ids.

    Works only with a scrapyd server running on localhost:6800.
    """
    scrapyd = ScrapydAPI('http://localhost:6800')
    # Schedule each spider and map its response key to the unique job id.
    jobs = {
        'countries_id': scrapyd.schedule('default', 'countries'),
        'food_id': scrapyd.schedule('default', 'food'),
        'population_id': scrapyd.schedule('default', 'population'),
        'poverty_id': scrapyd.schedule('default', 'poverty'),
    }
    return JsonResponse(jobs)
def derivativeSearchWordTaskSchedule():
    """Schedule the derivative-search-word spider once per stored keyword record."""
    derivativeSearchWordSpidersInfo = {
        '西瓜头条系列': 'derivativeSearchWord',
    }
    extraParams = json.dumps({'proxytype': '1'}, ensure_ascii=False,
                             separators=(',', ':'))
    for series, spider_key in derivativeSearchWordSpidersInfo.items():
        # Only the xigua/toutiao series has source records; others are empty.
        if series == '西瓜头条系列':
            records = derivativeSearchWordData.objects.all()
        else:
            records = []
        spider = Spider.objects.get(Q(name__exact=spider_key),
                                    Q(status__exact=0))
        deployProject = spider.deployProject
        for record in records:
            server = getRunServer(deployProject)
            if server:
                status = ScrapydAPI(server, timeout=8).schedule(
                    project=deployProject,
                    spider=spider.name,
                    dbId=record.id,
                    keyword=record.name,
                    extraParams=extraParams)
                print(status)
def add_domain_to_network(request, network_name):
    """Attach the POSTed URL's domain to *network_name*, starting a crawl for it."""
    url = request.POST.get('url', None)
    if not url:
        logger.debug('check received wrong request method.')
        messages.error(request, 'URL is missing!')
        return redirect('start')

    # Reuse a matching Domains row when one exists, otherwise create it.
    domain_name = get_domain_from_url(url)
    if Domains.objects.filter(domain__icontains=domain_name).exists():
        obj = Domains.objects.filter(domain__icontains=domain_name).first()
    else:
        obj = Domains.objects.create(domain=domain_name, url=url)

    # Launch the external spider for the domain and remember its job id.
    scrapyd = ScrapydAPI('http://localhost:6800')
    job_id = scrapyd.schedule('default', 'externalspider',
                              started_by_domain=obj.domain, keywords=[])
    ExternalSpider.objects.create(domain=obj, job_id=job_id)
    obj.status = 'external_started'
    obj.save()

    if not Network.objects.filter(name=network_name).exists():
        msg = 'Network: {} not found!'.format(network_name)
        messages.warning(request, msg)
        return redirect('start')
    nw = Network.objects.filter(name=network_name).first()
    nw.domains.add(obj)
    return redirect('network', network_name=nw.name)
def loadproxysite(request):
    """Snapshot a site through the sitesavespider and return its metadata.

    Records click history in the module-level ``click_dict`` (keyed by the
    global click counter ``cc``) and blocks until the scrapyd job finishes.
    """
    import time  # local import: only needed by the polling loop below

    siteurl = request.POST.get('siteurl', None)
    clicked_html = request.POST.get('clicked_html', None)
    if str(clicked_html) == "0":
        cc.reset()  # deleting files on system
        click_dict.clear()
        click_dict[cc.counter] = [siteurl, FrameObject.filename]
    elif str(clicked_html) != "0":
        click_dict[cc.counter] = [siteurl, FrameObject.filename, clicked_html]
    cc.add()
    if siteurl is None:
        return HttpResponse("none")
    scrapyd = ScrapydAPI("http://localhost:6800")
    jid = scrapyd.schedule("feedbot", "sitesavespider", domain=siteurl)
    FrameObject.filename = siteurl.split("://")[1].replace("/", ".") + ".html"
    jsondata = {
        "filename": FrameObject.filename,
        "crawljob": jid,
        "siteurl": siteurl,
        "click_dict": click_dict
    }
    # Bug fix: the original spun in a tight ``while (1)`` loop, hammering
    # scrapyd with job_status requests; poll once per second instead.
    # (The original's trailing ``return HttpResponse("hello")`` was
    # unreachable and has been removed.)
    while True:
        if "finished" in scrapyd.job_status("feedbot", jid):
            jsondata["filename"] = adapt_javascript(jsondata)
            return JsonResponse(jsondata)
        time.sleep(1)
def scheduler_job():
    """
    Check the scheduled tasks once per minute and launch any that are due.
    """
    for model in Task.objects.all():
        # A task is due when more than ``scheduler_at`` minutes have
        # elapsed since its last update.
        due_after = model.scheduler_at * 60
        last_update = time.mktime(model.updated_at.timetuple())
        if time.time() - last_update <= due_after:
            continue
        client = Client.objects.get(id=model.client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            model.job_id = scrapyd.schedule(model.project_name,
                                            model.spider_name)
            model.success = True
        except ConnectionError:
            model.success = False
        finally:
            model.save()
class ScrapyTasks(object):
    """Facade over the scrapyd daemon configured at ``config.SCRAPYD_HOST``."""

    def __init__(self):
        self._scrapyd = ScrapydAPI(config.SCRAPYD_HOST)

    def launch_spider(self, project, spider):
        """Start *spider* inside *project*; returns the scrapyd job id."""
        return self._scrapyd.schedule(project, spider)
def twitterscraping(request):
    """Render the Twitter scraping page; on POST, launch the SpiderTwitter job."""
    form = TweetForm()
    context = {
        'topic': "",
        'max_tweets': 15,
        'action': reverse('twitterscraping'),
        'firtsTimeLoad': 1,
    }
    if request.method == 'POST':
        topic = request.POST.get('topic')
        max_tweet = request.POST.get('max_tweets')
        context['topic'] = topic
        context['max_tweets'] = int(max_tweet)
        # Persist this crawl request under a freshly generated history key.
        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()
        _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey,
                                             KeyWord=topic)
        _UserCrawlHistory.save(using="SentimentAppDB")
        global SpiderTwitterJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderTwitterJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME,
                                              'SpiderTwitter',
                                              historyKey=historyKey,
                                              count=max_tweet)
        context['firtsTimeLoad'] = 0
        context['historyKey'] = historyKey
    return render(request, 'SentimentApp/twitterscraping.html', {
        'result': context,
        'form': form
    })
class Scraper(models.Model):
    """A per-site scraper whose spider runs are controlled through scrapyd."""

    site = models.OneToOneField(Site, on_delete=models.CASCADE)
    description = models.TextField(null=True, blank=True)
    file = models.FileField(upload_to=scraper_path, null=True,
                            validators=[validate_py_extension])
    task_id = models.CharField(null=True, blank=True, max_length=255)
    last_scraped = models.DateTimeField(null=True)

    class Meta:
        ordering = ['site__name']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One scrapyd client per model instance, pointed at the local daemon.
        self.scrapyd = ScrapydAPI("http://localhost:6800")

    def start(self):
        """Schedule this site's spider and remember the scrapyd task id."""
        spider_name = "{}_{}_{}".format(self.site.name, self.site.gender,
                                        self.site.type)
        self.task_id = self.scrapyd.schedule("default", spider_name)
        self.save()

    def stop(self):
        """Cancel the currently tracked scrapyd task."""
        self.scrapyd.cancel("default", self.task_id)
        self.save()

    def spider_status(self):
        """Scrapyd's status string for the tracked task, or "-" when none."""
        if not self.task_id:
            return "-"
        return self.scrapyd.job_status('default', self.task_id)
def handle_request(request):
    """Schedule the quotes crawler for the requested keyword and return its rows.

    Blocks (polling once per second) until the scrapyd job reports
    "finished", then reads the matching rows back from MySQL and returns
    them as JSON.
    """
    # get keyword from input
    keyword = request.GET.get('keyword')
    # create an object of scrapyd API
    scrapyd = ScrapydAPI("http://localhost:6800")
    request_time = datetime.datetime.now()
    # create a job id
    job_id = scrapyd.schedule(project='quotes_scrape', spider='quotes_crawler',
                              keyword=keyword, request_time=request_time)
    job_status = "running"
    values = []
    # check for job status
    while job_status != "finished":
        job_status = scrapyd.job_status(project='quotes_scrape', job_id=job_id)
        if job_status == 'finished':
            # database connection
            con = pymysql.connect(host="localhost", user="******", passwd="",
                                  db="quotes_scrape",
                                  cursorclass=pymysql.cursors.DictCursor)
            cursor = con.cursor()
            # Security fix: the original interpolated job_id into the SQL
            # text with an f-string; use a parameterized query so the value
            # is always escaped by the driver.
            cursor.execute("select * from quotes where job_id = %s", (job_id,))
            values = cursor.fetchall()
            print(values)
        else:
            sleep(1)
    return JsonResponse(data=values, safe=False)
class ProductChecker(models.Model):
    """A product checker whose spider (named after the row) runs via scrapyd."""

    name = models.CharField(max_length=255, null=True, unique=True)
    description = models.TextField(null=True, blank=True)
    file = models.FileField(upload_to=scraper_path, null=True,
                            validators=[validate_py_extension])
    task_id = models.CharField(null=True, blank=True, max_length=255)
    last_scraped = models.DateTimeField(null=True)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Remote scrapyd daemon that hosts the checker spiders.
        self.scrapyd = ScrapydAPI("http://178.128.105.215:6800")

    def start(self):
        """Schedule the spider named after this checker; store its task id."""
        self.task_id = self.scrapyd.schedule("default", self.name)
        self.save()

    def stop(self):
        """Cancel the currently tracked scrapyd task."""
        self.scrapyd.cancel("default", self.task_id)
        self.save()

    def spider_status(self):
        """Scrapyd's status string for the tracked task, or "-" when none."""
        if not self.task_id:
            return "-"
        return self.scrapyd.job_status('default', self.task_id)
class ScrapydLoginFinderJob(object): def __init__(self, seed_url, username, password, db_name, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="login_finder"): scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port) self.scrapi = ScrapydAPI(scrapy_url) self.project = project self.spider = spider self.seed_url = seed_url self.username = username self.password = password self.db_name = db_name def schedule(self): self.job_id = self.scrapi.schedule(self.project, self.spider, seed_url = self.seed_url, username = self.username, password = self.password, db_name = self.db_name) return self.job_id def list_jobs(self): return self.scrapi.list_jobs(self.project) def get_state(self): try: self.job_id except: Exception("You must schedule a job before getting the state!") try: for job in self.scrapi.list_jobs(self.project)["running"]: print self.job_id, job["id"] if job["id"] == self.job_id: return "Running" for job in self.scrapi.list_jobs(self.project)["pending"]: print self.job_id, job["id"] if job["id"] == self.job_id: return "Pending" except: print "handled exception:" traceback.print_exc() return None return "Done" def block_until_done(self, timeout = 120): exec_time = 0 while 1: exec_time += 1 if exec_time == timeout: raise Exception("Timeout time reached for login_finder spider execution") time.sleep(1) state = self.get_state() if state == "Done": break
def task():
    """Schedule the captured project/spider on the captured node via scrapyd."""
    node = Node.objects.get(id=node_id)
    api = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
    try:
        return JsonResponse({'job': api.schedule(project_name, spider_name)})
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
def _start_spider(domain, keywords=None):
    """Launch ``externalspider`` for *domain* on the local scrapyd; returns the job id."""
    api = ScrapydAPI('http://localhost:6800')
    return api.schedule('default', 'externalspider',
                        started_by_domain=domain, keywords=keywords)
def task(request, project_name, spider_names, client_id):
    """Schedule every spider in *spider_names* on the given client's scrapyd.

    :return: JSON mapping spider name -> scrapyd job id, or a 500 response
        when the daemon cannot be reached.
    """
    client = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        # Bug fix: the original returned inside the loop, so only the first
        # spider was ever scheduled; collect every job id and respond once.
        jobs = {}
        for spider_name in spider_names:
            jobs[spider_name] = scrapyd.schedule(project_name, spider_name)
        return JsonResponse(jobs)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
def debug():
    """Run the sougou hot-list spider for two minutes, then cancel it and wait."""
    api = ScrapydAPI("http://localhost:6800")
    job = api.schedule(project='sougouHotList', spider='sougouHotListSpider')
    time.sleep(120)
    print ('开始关闭爬虫')
    api.cancel(project='sougouHotList', job=job)
    time.sleep(300)
    print ('关闭爬虫')
def closed(self, reason):
    """Spider-closed hook: log the summary, archive the run log in MySQL,
    and reschedule the spider when enough products remain.

    :param reason: scrapy's close reason; rescheduling only happens on a
        normal 'finished' shutdown (a cancelled job must not respawn).
    """
    # Join all the report summary strings, one per line, and log them.
    stats = self.crawler.stats.get_stats()
    stats = pprint.pformat(stats)
    s = '\n'.join(self.report_summary)
    self.logger.info(s)

    # Read the log file back so its content can be archived.
    log_path = self.settings.get('LOG_FILE')
    file = open('%s' % log_path, 'r')
    file_content = file.read()
    file.close()

    # Store the log of this run in our mysql db.
    self.cursor.execute(
        "INSERT INTO `scrapy_logs` "
        "(`spider`, `test`,`log_date`, `log_file`, `stats`, `short_msg`, `long_msg`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)",
        ('spider_new_products', self.test, time.time(), log_path, stats, s,
         file_content)
    )
    self.conn.commit()

    # Count the products still needing an update, then release the db.
    self.count_new_products()
    self.conn.close()

    self.logger.info(
        "We still have number of products to update left: %s", self.items_left)

    # Bug fix: the original tested ``reason is 'finished'`` — an identity
    # comparison that only worked through CPython string interning; compare
    # by value instead.
    if reason == 'finished':
        if self.items_left >= 200 and int(self.limit) >= 51:  # 200
            if (self.test == 1) or (self.test == '1'):
                scrapyd = ScrapydAPI('http://127.0.0.1:6800')
                scrapyd.schedule('scraper1', 'ScrapeNewProduct',
                                 source=self.source, test='1',
                                 limit=self.limit)
            else:
                self.logger.info("Don't Reschedule because it's a test run")
        else:
            # NOTE(review): branch placement reconstructed from collapsed
            # source — this message is assumed to be the else of the
            # items_left/limit check; confirm against the original file.
            self.logger.info("Don't Reschedule because limit is smaller then 50 or items is lower than 200")
def task_spider_govbuy_list_by_spider_one_page(a=None, b=None): print '-' * 100 print 'now is %s' % datetime.datetime.now() scrapyd = ScrapydAPI('http://localhost:6800') spiders = scrapyd.list_spiders('govbuyscrapy') print 'spider has :', spiders run_spider_id = scrapyd.schedule('govbuyscrapy', 'govbuy_wan_timing_list') # 列表页面爬取 print 'spider runner id is :', run_spider_id scrapyd.job_status('govbuyscrapy', run_spider_id)
def start_hot_spider(self, request):
    """
    Start the daily-hot spider; requires admin rights.

    :param request: carries ``user_type``, the caller's permission level
        ("1" means admin).
    :return: Response("ok") when the spider was launched, otherwise
        Response("failed").
    """
    user_type = request.data.get("user_type")
    if user_type != "1":
        return Response("failed")
    # Admin confirmed — talk to the scrapyd daemon and launch the spider.
    scrapyd = ScrapydAPI('http://localhost:6800')
    print(scrapyd.list_projects())  # show the known project names
    scrapyd.schedule('default', 'hotdaily')  # start the spider
    return Response("ok")
def task_spider_govbuy_content_spider(a=None, b=None): print '=' * 100 print 'now is %s' % datetime.datetime.now() scrapyd = ScrapydAPI('http://localhost:6800') spiders = scrapyd.list_spiders('govbuyscrapy') print 'spider has :', spiders run_spider_id = scrapyd.schedule('govbuyscrapy', 'govbuy_wan_timing_detail') # 详情页面爬虫 print 'spider runner id is :', run_spider_id scrapyd.job_status('govbuyscrapy', run_spider_id)
def schedule_job(project, spider, url=DEFAULT_URL, settings=None, **kwargs):
    """ Schedule a spider run on a scrapyd daemon.

    @param project: scrapy project name
    @param spider: spider name
    @param url: the url which target scrapyd daemon listens on
    @param settings: optional settings dictionary for the run
    @return: the scrapyd job id

    To schedule a spider run:
    curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    # Bug fix: ``settings={}`` was a shared mutable default argument; use
    # None as the default and substitute a fresh dict per call. Callers that
    # passed their own dict are unaffected.
    if settings is None:
        settings = {}
    scrapyd = ScrapydAPI(url)
    return scrapyd.schedule(project, spider, settings, **kwargs)
def webcrawling(request):
    """Render the web-crawl form; on POST, launch SpiderWebCrawler for the input."""
    context = {}
    form = WebCrawlForm()
    # Default newspaper URL list shown in the form (first entry separated by
    # '\n', the rest by '\r\n', exactly as the original literal produced).
    context['formUrlList'] = (
        "http://www.newspapers71.com/\n"
        "http://www.ntvbd.com/\r\n"
        "http://www.prothom-alo.com/\r\n"
        "http://www.kalerkantho.com/\r\n"
        "http://www.bhorerkagoj.net/\r\n"
        "http://www.jaijaidinbd.com/\r\n"
        "http://www.amadershomoy.biz/beta/\r\n"
        "https://www.dailyinqilab.com/\r\n"
        "http://www.jugantor.com/\r\n"
        "http://www.dailynayadiganta.com/\r\n"
        "http://www.mzamin.com/")
    context['formKeyWordList'] = "এসিআই\r\nস্বপ্ন\r\nস্যাভলন"
    context['action'] = reverse('webcrawling')
    context['firtsTimeLoad'] = 1
    if request.method == 'POST':
        urlText = request.POST.get('url')
        context['formUrlList'] = urlText
        keyWordList = request.POST.get('keyWord')
        context['formKeyWordList'] = keyWordList
        keyWord = keyWordList.splitlines()
        depth = request.POST.get('depth')
        context['depth'] = int(depth)
        # One history row per keyword, all sharing a single history key.
        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()
        for key in keyWord:
            _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey,
                                                 KeyWord=key)
            _UserCrawlHistory.save(using='SentimentAppDB')
        global SpiderWebCrawlerJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderWebCrawlerJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME,
                                                 'SpiderWebCrawler',
                                                 urls=urlText, depth=depth,
                                                 historyKey=historyKey)
        context['historyKey'] = historyKey
        context['firtsTimeLoad'] = 0
    return render(request, 'SentimentApp/webcrawling.html', {
        'result': context,
        'form': form
    })
def videoGetDetailsTaskSchedule():
    # Dispatch detail-fetching spiders for each video platform, batching
    # records into groups of ``batchCheckNums`` per scrapyd job.
    platformInfo = {
        '哔哩哔哩视频': 'bilibiliDetailInfo',
        # An earlier spider took the nicer name; this one fetches the fuller
        # xigua-series detail info.
        '西瓜视频': 'xiguaDetailedInfo',
        '今日头条': 'xiguaDetailedInfo',
        '今日头条_点赞数': 'xiguaDetailInfo',  # only supplements the like count
    }
    batchCheckNums = 64
    extraParams = {
        'proxytype': '1',
    }
    extraParams = json.dumps(extraParams, ensure_ascii=False,
                             separators=(',', ':'))
    for k, v in platformInfo.items():
        # Select the source record set that matches this platform key.
        if k == '哔哩哔哩视频' or k == '西瓜视频' or k == '今日头条':
            records = VideoDetailsData.objects.filter(
                platform__exact=k).filter(status__exact=2)
        elif k == '今日头条_点赞数':
            records = MovieOfflineData.objects.filter(
                platform__exact='今日头条').filter(ishz__exact=1).filter(
                    detailStatus__exact=0).filter(tag__in=['待处理', '未下线'])
        else:
            records = []
        spider = Spider.objects.get(Q(name__exact=v), Q(status__exact=0))
        deployProject = spider.deployProject
        # i counts server-rotation slots, j counts records within a batch,
        # m counts records overall (to detect the final partial batch).
        i = 0
        scheduleServer = None
        j = m = 1
        paramList = []
        for record in records:
            # Refresh the target server at the start of each server batch.
            if i % scrapydBatchSize == 0 and (j - 1) % batchCheckNums == 0:
                scheduleServer = getRunServer(deployProject)
            if scheduleServer:
                scrapyd = ScrapydAPI(scheduleServer, timeout=8)
            paramList.append({'id': record.id, 'targetUrl': record.url})
            # Flush a full batch (or the final partial batch) to scrapyd.
            # NOTE(review): indentation reconstructed from collapsed source;
            # the flush and counter increments are assumed to sit at loop
            # level — confirm against the original file.
            if j % batchCheckNums == 0 or m == len(records):
                params = json.dumps(paramList, ensure_ascii=False,
                                    separators=(',', ':'))
                print(params)
                status = scrapyd.schedule(project=deployProject,
                                          spider=spider.name,
                                          idTargetUrlList=params,
                                          extraParams=extraParams)
                print(status)
                paramList = []
            i += 1
            j += 1
            m += 1
def run_spider(sender, instance, created, **kwargs):
    """
    runs spider and if success updates its job id
    :param sender: model class emitting the signal (unused)
    :param instance: the saved model instance
    :param created: True only on the first save
    :param kwargs: remaining signal arguments (unused)
    :return: None
    """
    if not created:
        return
    job_id = ScrapydAPI('http://scrapyd:6800').schedule(BOT_NAME, PARSER_NAME)
    if job_id:
        instance.job_id = job_id
        instance.save(update_fields=['job_id'])
class ParsingService(BaseService):
    """Schedules one category-spider run per stored category."""

    def __init__(self) -> NoReturn:
        # Bug fix: the original had trailing commas after these two
        # assignments, which turned ``self.spider`` and ``self.project``
        # into 1-element tuples that were then passed verbatim to
        # scrapyd.schedule().
        self.spider = 'category'
        self.project = 'twenty_first_century'
        self.categories = CategoryModel.objects.all()
        self.scrapyd_api = ScrapydAPI(settings.PARSER_URL)

    def start(self) -> NoReturn:
        """Launch a crawl for every category and record each job id."""
        for category in self.categories:
            job_id = self.scrapyd_api.schedule(
                spider=self.spider,
                project=self.project,
                category=category.link,
            )
            ScrapyModel.objects.create(category=category, job_id=job_id)
def handle(self, *args, **options): username = options['username'] feedname = options['feedname'] element_dict = {} for element in elements: element_dict[element] = get_element_info(element, username, feedname) print element_dict scrapyd = ScrapydAPI("http://localhost:6800") jid = scrapyd.schedule("feedbot", "rssspider", element_dict=json.dumps(element_dict)) print jid print json.dumps(element_dict) if "finished" in scrapyd.job_status("feedbot", jid): return 2
def spider_start(request, client_id, project_name, spider_name):
    """
    start a spider
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :param spider_name: spider name
    :return: json
    """
    # Only GET requests trigger a launch; anything else falls through (None).
    if request.method != 'GET':
        return
    client = Client.objects.get(id=client_id)
    api = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        return JsonResponse({'job': api.schedule(project_name, spider_name)})
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
def scheduler_job():
    """
    Check the scheduled tasks once per minute and launch any that are due.
    """
    for model in Task.objects.all():
        # Due when more than ``scheduler_at`` minutes have passed since the
        # task's last update.
        elapsed = time.time() - time.mktime(model.updated_at.timetuple())
        if elapsed <= model.scheduler_at * 60:
            continue
        client = Client.objects.get(id=model.client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            job = scrapyd.schedule(model.project_name, model.spider_name)
            model.success = 1
        except ConnectionError:
            model.success = 0
        finally:
            model.save()
class Overseer(object):
    """
    Overseer facilitate the deployment process of local spiders to a remote scrapyd server

    Available methods:
        spawn_spiders
            Create spider and deploy them to remote scrapyd server
        get_status
            Report the current status of the remote scrapyd server
    """

    DEFAULT_TYPE = 'sell'
    DEFAULT_VENDOR = 'None'

    def __init__(self, name, spider_name, host, mongodb_credentials):
        # scrapyd client plus the Mongo collection holding the crawl queue.
        self.server = ScrapydAPI(host)
        self.host_name = self._strip_host_name(host)
        self.birth_date = datetime.utcnow()
        self.name = name
        self.spider_name = spider_name
        self.alive = True
        client = pymongo.MongoClient(mongodb_credentials['server'],
                                     mongodb_credentials['port'],
                                     connectTimeoutMS=30000,
                                     socketTimeoutMS=None,
                                     socketKeepAlive=True)
        db = client[mongodb_credentials['database']]
        self.collection = db[mongodb_credentials['collection']]

    def kill(self):
        # Mark this overseer dead; returns its host name for bookkeeping.
        self.alive = False
        return self.host_name

    def heartbeat(self):
        # Liveness flag checked by the supervisor.
        return self.alive

    def spawn_spiders(self, num_spiders=5, items_per_spider=100, **kwargs):
        # Launch num_spiders jobs, each fed items_per_spider queued tasks.
        type = kwargs.get('type', self.DEFAULT_TYPE)
        vendor = kwargs.get('vendor', self.DEFAULT_VENDOR)
        count = 0
        while count < num_spiders:
            count += 1
            self._spawn(vendor, type, items_per_spider)
            time.sleep(3)  # brief pause between launches

    def get_status(self):
        """
        Return:
            the number of running spiders
            the number of finished spiders
            the average time for one spider to finish
        """
        status = self.server.list_jobs(self.name)
        running = status['running']
        finished = status['finished']
        finished_times = [self._time_diff_in_minute(job['end_time'], job['start_time'])
                          for job in finished]
        avg_time = np.average(finished_times)
        Notification('{} - [{}] \t Running Spiders = {}, Finished Spiders = {}, Average Runtime = {}'
                     .format(datetime.utcnow(),
                             self.host_name,
                             len(running),
                             len(finished),
                             avg_time
                             )
                     .expandtabs(3)
                     ).info()
        return len(running), len(finished), avg_time

    def _spawn(self, vendor, type, items_per_spider=100):
        # Get the tasks from the database
        tasks = self._get_tasks_from_database(vendor, type, items_per_spider)
        if not tasks:
            raise ValueError('There is no more task from the database!')
        links, property_ids = zip(*tasks)
        # Schedule the tasks with the remote scrapyd server
        job_id = self.server.schedule(self.name, self.spider_name,
                                      vendor=vendor,
                                      crawl_url=','.join(links),
                                      type=type)
        Notification('{} - [{}] \t Launch spider {}'
                     .format(datetime.utcnow(), self.host_name, job_id)
                     .expandtabs(3)
                     ).success()
        # Clear the tasks from the database
        self._clear_tasks_from_database(vendor, type, property_ids)

    def _get_tasks_from_database(self, vendor, type, items_per_spider):
        # Oldest never-crawled tasks first, limited to one spider's worth.
        cursor = self.collection \
            .find({"last_crawled_date": None, "type": type, "vendor": vendor}) \
            .sort("created_date", pymongo.ASCENDING) \
            .limit(items_per_spider)
        tasks = [(item['link'], item['property_id']) for item in cursor]
        return tasks

    def _clear_tasks_from_database(self, vendor, type, property_ids):
        # Stamp the dispatched tasks so they are not handed out twice.
        self.collection.update({"vendor": vendor,
                                "type": type,
                                "property_id": {"$in": property_ids}},
                               {"$set": {"last_crawled_date": datetime.utcnow()}},
                               multi=True, upsert=False)

    @staticmethod
    def _time_diff_in_minute(current, previous):
        # Minute difference between two parsed timestamps, modulo one hour.
        return ((parser.parse(current) - parser.parse(previous)).seconds // 60) % 60

    @staticmethod
    def _strip_host_name(host):
        # "http://ec2-...compute.amazonaws.com:6800" -> short host label.
        return host.replace('http://', '').replace('.compute.amazonaws.com:6800', '')