def call_scrapyd_service():
    """Operate the spiders through the API.

    Reference: https://pypi.python.org/pypi/python-scrapyd-api#downloads
    """
    scrapyd = ScrapydAPI('http://localhost:6800')
    scrapyd.job_status('govbuyscrapy', '0c838fd4b9f111e6abcc14dda97ae760')  # check the status of a given job
    scrapyd.list_jobs('govbuyscrapy')  # list the project's jobs
    scrapyd.schedule('govbuyscrapy', 'govbuy_wan_shucheng')  # run the given spider in the given project
def task_spider_govbuy_content_spider(a=None, b=None):
    print '=' * 100
    print 'now is %s' % datetime.datetime.now()
    scrapyd = ScrapydAPI('http://localhost:6800')
    spiders = scrapyd.list_spiders('govbuyscrapy')
    print 'spider has :', spiders
    run_spider_id = scrapyd.schedule('govbuyscrapy', 'govbuy_wan_timing_detail')  # detail-page spider
    print 'spider runner id is :', run_spider_id
    scrapyd.job_status('govbuyscrapy', run_spider_id)
def task_spider_govbuy_list_by_spider_one_page(a=None, b=None):
    print '-' * 100
    print 'now is %s' % datetime.datetime.now()
    scrapyd = ScrapydAPI('http://localhost:6800')
    spiders = scrapyd.list_spiders('govbuyscrapy')
    print 'spider has :', spiders
    run_spider_id = scrapyd.schedule('govbuyscrapy', 'govbuy_wan_timing_list')  # list-page crawl
    print 'spider runner id is :', run_spider_id
    scrapyd.job_status('govbuyscrapy', run_spider_id)
def get(self, request):
    scrapyd = ScrapydAPI('http://localhost:6800')

    # receive the id of each task
    countries_id = request.GET.get('countries_id', None)
    food_id = request.GET.get('food_id', None)
    population_id = request.GET.get('population_id', None)
    poverty_id = request.GET.get('poverty_id', None)

    # determine the status of each task and return a dict
    jobs = {
        'countries_status': scrapyd.job_status('default', countries_id),
        'food_status': scrapyd.job_status('default', food_id),
        'population_status': scrapyd.job_status('default', population_id),
        'poverty_status': scrapyd.job_status('default', poverty_id)
    }
    return JsonResponse(jobs)
def loadproxysite(request):
    siteurl = request.POST.get('siteurl', None)
    clicked_html = request.POST.get('clicked_html', None)
    if str(clicked_html) == "0":
        cc.reset()  # deleting files on system
        click_dict.clear()
        click_dict[cc.counter] = [siteurl, FrameObject.filename]
    elif str(clicked_html) != "0":
        click_dict[cc.counter] = [siteurl, FrameObject.filename, clicked_html]
    cc.add()
    if siteurl is None:
        return HttpResponse("none")
    else:
        scrapyd = ScrapydAPI("http://localhost:6800")
        jid = scrapyd.schedule("feedbot", "sitesavespider", domain=siteurl)
        FrameObject.filename = siteurl.split("://")[1].replace("/", ".") + ".html"
        jsondata = {
            "filename": FrameObject.filename,
            "crawljob": jid,
            "siteurl": siteurl,
            "click_dict": click_dict
        }
        # poll scrapyd until the crawl job reports "finished"
        while (1):
            if "finished" in scrapyd.job_status("feedbot", jid):
                jsondata["filename"] = adapt_javascript(jsondata)
                return JsonResponse(jsondata)
    return HttpResponse("hello")
def handle_request(request):
    # get keyword from input
    keyword = request.GET.get('keyword')
    # create an object of the scrapyd API
    scrapyd = ScrapydAPI("http://localhost:6800")
    request_time = datetime.datetime.now()
    # schedule the crawl and get a job id
    job_id = scrapyd.schedule(project='quotes_scrape', spider='quotes_crawler',
                              keyword=keyword, request_time=request_time)
    qry = f"select * from quotes where job_id = '{job_id}'"
    job_status = "running"
    values = []
    # check the job status until it is finished
    while job_status != "finished":
        job_status = scrapyd.job_status(project='quotes_scrape', job_id=job_id)
        if job_status == 'finished':
            # database connection
            con = pymysql.connect(host="localhost", user="******", passwd="",
                                  db="quotes_scrape",
                                  cursorclass=pymysql.cursors.DictCursor)
            cursor = con.cursor()
            # get this job's records from the database
            cursor.execute(qry)
            values = cursor.fetchall()
            print(values)
        else:
            sleep(1)
    return JsonResponse(data=values, safe=False)
class Scraper(models.Model):
    site = models.OneToOneField(Site, on_delete=models.CASCADE)
    description = models.TextField(null=True, blank=True)
    file = models.FileField(upload_to=scraper_path, null=True,
                            validators=[validate_py_extension])
    task_id = models.CharField(null=True, blank=True, max_length=255)
    last_scraped = models.DateTimeField(null=True)

    class Meta:
        ordering = ['site__name']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.scrapyd = ScrapydAPI("http://localhost:6800")

    def start(self):
        spider_name = "{}_{}_{}".format(self.site.name, self.site.gender, self.site.type)
        self.task_id = self.scrapyd.schedule("default", spider_name)
        self.save()

    def stop(self):
        self.scrapyd.cancel("default", self.task_id)
        self.save()

    def spider_status(self):
        if self.task_id:
            job_status = self.scrapyd.job_status('default', self.task_id)
            return job_status
        else:
            return "-"
class ProductChecker(models.Model):
    name = models.CharField(max_length=255, null=True, unique=True)
    description = models.TextField(null=True, blank=True)
    file = models.FileField(upload_to=scraper_path, null=True,
                            validators=[validate_py_extension])
    task_id = models.CharField(null=True, blank=True, max_length=255)
    last_scraped = models.DateTimeField(null=True)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.scrapyd = ScrapydAPI("http://178.128.105.215:6800")

    def start(self):
        self.task_id = self.scrapyd.schedule("default", self.name)
        self.save()

    def stop(self):
        self.scrapyd.cancel("default", self.task_id)
        self.save()

    def spider_status(self):
        if self.task_id:
            job_status = self.scrapyd.job_status('default', self.task_id)
            return job_status
        else:
            return "-"
def check_parser_status():
    """
    Task that checks the current status (running, scheduled, finished, or an
    empty string if no scraper with the selected job id is found) of all
    scrapers in the database.
    :return:
    """
    spiders = Scraper.objects.all()
    scrapyd = ScrapydAPI('http://scrapyd:6800')
    for spider in spiders:
        status = scrapyd.job_status(BOT_NAME, spider.job_id)
        spider.status = status
        spider.save(update_fields=['status'])
class scrapy_test(APIView):
    def __init__(self):
        self.scrapyd = ScrapydAPI('http://localhost:6800')
        self.dist_path = os.path.join(os.path.abspath(os.getcwd()), 'dist')

    def post(self, request, *args, **kwargs):
        project_name = request.POST.get('project_name', '')
        spider_name = request.POST.get('spider_name', '')
        scrapy_settings = {
            "FEED_URI": "file:" + os.path.join(self.dist_path, spider_name) + ".json",
            'FEED_FORMAT': "json"
        }
        job_id = self.scrapyd.schedule(project_name, spider_name, settings=scrapy_settings)

        # wait until the scheduled job has finished
        while self.scrapyd.job_status(project_name, job_id) != 'finished':
            print(self.scrapyd.job_status(project_name, job_id))
            time.sleep(3)

        # read the spider's output file
        with open(os.path.join('.', 'dist', spider_name + '.json'), 'r+') as _f:
            crawl_data = json.load(_f)

        # insert the data
        serializer = CrawlerSerializers(data=crawl_data, context=request, many=True)
        if not serializer.is_valid():
            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
        instance = serializer.save()
        return HttpResponse("Succeed")
def handle(self, *args, **options):
    username = options['username']
    feedname = options['feedname']
    element_dict = {}
    for element in elements:
        element_dict[element] = get_element_info(element, username, feedname)
    print element_dict
    scrapyd = ScrapydAPI("http://localhost:6800")
    jid = scrapyd.schedule("feedbot", "rssspider", element_dict=json.dumps(element_dict))
    print jid
    print json.dumps(element_dict)
    if "finished" in scrapyd.job_status("feedbot", jid):
        return 2
class ScrapyAgent(object):
    """Agent class that proxies a Scrapy project on a scrapyd server."""

    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)

    def __repr__(self):
        return '<ScrapyAgent %s>' % self.server_url

    @property
    def server(self):
        return self.server_url

    def list_projects(self):
        return self.scrapyd_api.list_projects()

    def del_project(self, project_name):
        try:
            return self.scrapyd_api.delete_project(project_name)
        except:
            return False

    def list_spiders(self, project_name):
        return self.scrapyd_api.list_spiders(project_name)

    def start_spider(self, project_name, spider_name):
        return self.scrapyd_api.schedule(project_name, spider_name)

    def cancel_spider(self, project_name, job_id):
        return self.scrapyd_api.cancel(project_name, job_id)

    def deploy(self, project_name: str, version: int, egg_byte: BinaryIO) -> "Dict or bool":
        spider_num = self.scrapyd_api.add_version(project_name, version, egg_byte)
        return {
            'project': project_name,
            'version': version,
            'spiders': spider_num,
        } if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        return '{}/logs/{}/{}/{}'\
            .format(self.server_url, project_name, spider_name, job_id)

    def job_status(self, project_name, job_id):
        return self.scrapyd_api.job_status(project_name, job_id)
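A minimal usage sketch for ScrapyAgent; the server URL and the 'default'/'example_spider' project and spider names below are illustrative assumptions, not taken from the original project.

import time

agent = ScrapyAgent('http://localhost:6800')                # assumed scrapyd address
print(agent.list_projects())                                # projects known to the server

job_id = agent.start_spider('default', 'example_spider')    # hypothetical project/spider
while agent.job_status('default', job_id) in ('pending', 'running'):
    time.sleep(5)                                           # poll until the job completes
print(agent.log_url('default', 'example_spider', job_id))   # where scrapyd keeps the log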
def Jobcrawl(request):
    '''Start crawling job vacancies.'''
    scrapyd = ScrapydAPI('http://localhost:6801')

    if request.method == 'POST':
        unique_id = str(uuid4())  # create a unique ID

        # custom settings for the scrape
        settings = {
            'unique_id': unique_id,  # unique ID for each record in the DB
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }
        task = scrapyd.schedule("default1", "jobsID", settings=settings)
        return JsonResponse({
            'task_id': task,
            'unique_id': unique_id,
            'status': 'started'
        })

    elif request.method == 'GET':
        # Check the status of the crawl.
        # The task_id and unique_id were returned by the POST request above,
        # kept on the client side, and passed back here so we can check the
        # status of the crawl. If crawling is complete, we respond with the
        # crawled data.
        task_id = request.GET.get('task_id', None)
        unique_id = request.GET.get('unique_id', None)
        url = request.GET.get('url', None)
        if not task_id or not unique_id:
            return JsonResponse({'error': 'Missing args'})

        # Check the status of the crawl that started a few seconds ago.
        # If it is finished, we can query the database and return results;
        # otherwise we return the current status.
        # Possible results are -> pending, running, finished
        status = scrapyd.job_status('default1', task_id)

        # Once status == finished, stop checking.
        if status == 'finished':
            return JsonResponse({'data': url, 'status': 'finished'})
        else:
            return JsonResponse({'status': status})
class Scraper:
    def __init__(self):
        self.client = ScrapydAPI("http://scrapyd:6800", timeout=10)
        self.project = 'default'

    def schedule_spider(self, spider_name: str):
        print(f"RUN SPIDER: {spider_name}")
        return self.client.schedule(self.project, spider_name)

    def cancel_job(self, job_id: str):
        return self.client.cancel(self.project, job_id)

    def get_status_of_job(self, job_id: str):
        return self.client.job_status(self.project, job_id)

    def get_all_jobs(self):
        return self.client.list_jobs(self.project)

    def get_all_spiders(self):
        return self.client.list_spiders(self.project)
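A minimal sketch of how this wrapper might be driven, assuming the 'scrapyd' host is reachable and that a spider named 'example_spider' (a placeholder) is deployed in the 'default' project.

import time

scraper = Scraper()
print(scraper.get_all_spiders())                     # spiders deployed in the 'default' project

job_id = scraper.schedule_spider('example_spider')   # 'example_spider' is a placeholder name
while scraper.get_status_of_job(job_id) in ('pending', 'running'):
    time.sleep(5)                                    # poll until the job leaves the queue
print(scraper.get_status_of_job(job_id))             # 'finished', or '' if the job id is unknown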
class Scrapyd_Control(object):
    def __init__(self):
        scrapyd_url = input('Enter the scrapyd address: ')
        project = input('Enter the project name: ')
        self.project = project
        self.scrapyd = ScrapydAPI(scrapyd_url)

    # start a spider
    def schedule(self):
        spider = input('Enter the spider name: ')
        return {
            'project': self.project,
            'spider': spider,
            'jobid': self.scrapyd.schedule(self.project, spider)
        }

    start, run = schedule, schedule

    # cancel a spider
    def cancel(self):
        jobid = input('Paste the jobid of the spider to cancel: ')
        return self.scrapyd.cancel(self.project, jobid)

    # list projects
    def listprojects(self):
        return self.scrapyd.list_projects()

    # list spiders
    def listspiders(self):
        return self.scrapyd.list_spiders(self.project)

    # list all jobs
    def listjobs(self):
        return self.scrapyd.list_jobs(self.project)

    # check a job's status
    def jobstatus(self):
        jobid = input('Paste the jobid to check: ')
        return self.scrapyd.job_status(self.project, jobid)

    # list versions
    def listversions(self):
        return self.scrapyd.list_versions(self.project)

    # delete a version
    def delversion(self):
        version_name = input('Paste the version to delete: ')
        yes = input('Confirm deletion of version {}? Type yes to delete, or press Enter to skip\n'.format(version_name))
        if yes == 'yes':
            return self.scrapyd.delete_version(self.project, version_name)
        else:
            pass

    # delete the project
    def delproject(self):
        yes = input('Confirm deletion of project {}? Type yes to delete, or press Enter to skip\n'.format(self.project))
        if yes == 'yes':
            return self.scrapyd.delete_project(self.project)
        else:
            pass

    # list all commands
    def help(self):
        print("""
        start a spider       schedule|start|run
        cancel a spider      cancel
        list projects        listprojects
        list spiders         listspiders
        list all jobs        listjobs
        check job status     jobstatus
        list versions        listversions
        delete a version     delversion
        delete the project   delproject
        list all commands    help
        """)
def GetTwitterCrawlerStatus(request):
    print("Twitter Ajax Calling")
    _index = 0
    status = False
    global SpiderTwitterJOBID
    try:
        # global scrapyd
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        if SpiderTwitterJOBID != 'SpiderTwitterKey':
            state = scrapyd.job_status(SCRAPYD_PROJECT_NAME, SpiderTwitterJOBID)
            print("Twitter JOBID = " + SpiderTwitterJOBID)
            print("Twitter JOB State = " + state)
            if state == RUNNING or state == PENDING:
                status = True
            else:
                status = False
    except ConnectionError:
        status = False

    response = []
    item = []
    score = []
    id = 0
    if status == True:
        _index = request.GET.get('index', None)
        _historyKey = request.GET.get('historyKey', None)
        print("DB Index = " + str(_index) + " and History key = " + str(_historyKey))
        result = TwitterHistory.objects.using('SentimentAppDB').filter(
            id__gt=_index, historykey=_historyKey).values()
        if len(list(result)) != 0:
            for resCrawl in result:
                res = list(
                    Score.objects.using('SentimentAppDB').filter(
                        id=resCrawl['scoreid_id']).values())

                sTextBlob = res[0]['ScoreTextBlob']
                scoreTextBlob = "{" + sTextBlob + "}"
                dt = json.loads(scoreTextBlob)
                if float(dt['polarity']) >= 0.3:
                    textBlobResult = {'value': "positive", 'score': str(dt['polarity'])}
                elif float(dt['polarity']) <= -0.3:
                    textBlobResult = {'value': "negative", 'score': str(dt['polarity'])}
                else:
                    textBlobResult = {'value': "neutral", 'score': str(dt['polarity'])}
                res[0]['ScoreTextBlob'] = textBlobResult

                sVader = res[0]['ScoreVader']
                scoreVader = "{" + sVader + "}"
                d = json.loads(scoreVader)
                if float(d['comp']) >= 0.3:
                    vaderResult = {'value': "positive", 'score': str(d['comp'])}
                elif float(d['comp']) <= -0.3:
                    vaderResult = {'value': "negative", 'score': str(d['comp'])}
                else:
                    vaderResult = {'value': "neutral", 'score': str(d['comp'])}
                res[0]['ScoreVader'] = vaderResult

                sGoogleNLP = res[0]['ScoreGoogleNLP']
                scoreGoogleNLP = "{" + sGoogleNLP + "}"
                da = json.loads(scoreGoogleNLP)
                if float(da['score']) >= 0.3:
                    googleNLPResult = {'value': "positive", 'score': str(da['score'])}
                elif float(da['score']) <= -0.3:
                    googleNLPResult = {'value': "negative", 'score': str(da['score'])}
                else:
                    googleNLPResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreGoogleNLP'] = googleNLPResult

                sStanfordCoreNLP = res[0]['ScoreStanfordCoreNLP']
                scoreStanfordCoreNLP = "{" + sStanfordCoreNLP + "}"
                da = json.loads(scoreStanfordCoreNLP)
                if float(da['score']) < 2:
                    stanfordCoreNLP = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 2:
                    stanfordCoreNLP = {'value': "positive", 'score': str(da['score'])}
                else:
                    stanfordCoreNLP = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreStanfordCoreNLP'] = stanfordCoreNLP

                sAzure = res[0]['ScoreAzure']
                scoreAzure = "{" + sAzure + "}"
                da = json.loads(scoreAzure)
                if float(da['score']) < 0.4:
                    azureResult = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 0.6:
                    azureResult = {'value': "positive", 'score': str(da['score'])}
                else:
                    azureResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreAzure'] = azureResult

                sIBM = res[0]['ScoreIBMNLP']
                scoreIBM = "{" + sIBM + "}"
                da = json.loads(scoreIBM)
                res[0]['ScoreIBM'] = {'value': str(da['sentiment']), 'score': str(da['score'])}

                resCrawl['created_at'] = resCrawl['created_at']
                id = resCrawl['id']
                item.append(resCrawl)
                score.append(res)

            data = {'value': item, 'score': score, 'status': status, 'index': id}
        else:
            data = {'value': [], 'score': [], 'status': status, 'index': _index}
    else:
        print("Job ended. Twitter scraping done.")
        data = {'value': [], 'score': [], 'status': status, 'index': _index}

    response.append(data)
    return JsonResponse(response, safe=False)
def job_status(project, job, url=DEFAULT_URL):
    scrapyd = ScrapydAPI(url)
    return scrapyd.job_status(project, job)
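A short sketch of calling the helper above; the project name and job id are placeholders, and DEFAULT_URL is assumed to point at a reachable scrapyd instance.

# 'myproject' and the job id below are purely illustrative
state = job_status('myproject', '0c838fd4b9f111e6abcc14dda97ae760')
if state == 'finished':
    print('job done')
elif state in ('pending', 'running'):
    print('still working:', state)
else:
    print('unknown job id')  # python-scrapyd-api returns '' for unknown jobs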
def crawl(request):
    '''Start the news crawl.'''
    # Crawling is only accepted via the POST method
    scrapyd = ScrapydAPI('http://localhost:6800')

    if request.method == 'POST':
        url = request.POST.get('url', None)  # get the submitted url
        website = request.POST.get('website', None)

        # check that a url was actually supplied
        if not url:
            return JsonResponse({'error': 'Missing args'})
        # check that the url is valid
        if not is_valid_url(url):
            return JsonResponse({'error': 'URL is invalid'})

        # check whether the url already exists
        # if linkURL.objects.filter(link=url).exists():
        #     return JsonResponse({'error': 'URL is already stored in the database'})
        # else:
        #     print("Hello World")
        #     d, created = linkURL.objects.get_or_create(link=url)
        #     if created:
        #         d.save()

        domain = urlparse(url).netloc  # parse the url and extract the domain
        unique_id = str(uuid4())  # create a unique ID

        # custom settings for the scrape
        settings = {
            'unique_id': unique_id,  # unique ID
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }

        # Here we schedule a new crawling task from scrapyd.
        # Notice that settings is a special argument name,
        # but we can pass other arguments as well.
        # This returns an ID which belongs to this task;
        # we use it later to check the task's status.
        task = scrapyd.schedule("default", website, settings=settings, url=url, domain=domain)

        return JsonResponse({
            'task_id': task,
            'unique_id': unique_id,
            'status': 'started'
        })

    elif request.method == 'GET':
        # Check the status of the crawl.
        # The task_id and unique_id were returned by the POST request above,
        # kept on the client side, and passed back here so we can check the
        # status of the crawl. If crawling is complete, we respond with the
        # crawled data.
        task_id = request.GET.get('task_id', None)
        unique_id = request.GET.get('unique_id', None)
        url = request.GET.get('url', None)
        if not task_id or not unique_id:
            return JsonResponse({'error': 'Missing args'})

        # Check the status of the crawl that started a few seconds ago.
        # If it is finished, we can query the database and return results;
        # otherwise we return the current status.
        # Possible results are -> pending, running, finished
        status = scrapyd.job_status("default", task_id)

        # Once status == finished, stop checking.
        if status == 'finished':
            # d, created = linkURL.objects.get_or_create(link=url)
            # if created:
            #     d.save()
            return JsonResponse({'data': url, 'status': 'finished'})
        else:
            return JsonResponse({'status': status})
def status_spider(job_id):
    scrapyd = ScrapydAPI(SCRAPYD_URL)
    status = scrapyd.job_status(SCRAPYD_PROJECT, job_id)
    return status
def GetWebCrawlerStatus(request):
    print("Ajax Calling - Retrieving web crawled dataset..")
    _index = 0
    status = False
    global SpiderWebCrawlerJOBID
    try:
        # global scrapyd
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        if SpiderWebCrawlerJOBID != 'SpiderWebCrawlerKey':
            state = scrapyd.job_status(SCRAPYD_PROJECT_NAME, SpiderWebCrawlerJOBID)
            print("Web Crawler JOBID = " + SpiderWebCrawlerJOBID)
            print("Web Crawler JOB State = " + state)
            if state == RUNNING or state == PENDING:
                status = True
            else:
                status = False
    except ConnectionError:
        status = False

    response = []
    item = []
    score = []
    id = 0
    if status == True:
        _index = request.GET.get('index', None)
        _historyKey = request.GET.get('historyKey', None)
        print("Web Crawler DB Index = " + _index + " and HistoryKey = " + _historyKey)
        # result = WebCrawl.objects.using('SentimentAppDB').raw(
        #     "SELECT * FROM [dbo].[CrawlResult] where [id] > " + str(_index) +
        #     " and [HistoryKey] = '" + _historyKey + "'")
        result = WebCrawl.objects.using('SentimentAppDB').filter(
            id__gt=_index, HistoryId=_historyKey).values()
        # if len(list(result)) != 0:
        #     for resCrawl in result:
        #         print(resCrawl.scoreid_id)
        #         res = list(Score.objects.using('SentimentAppDB').filter(id=resCrawl.scoreid_id).values())
        if len(list(result)) != 0:
            for resCrawl in result:
                res = list(
                    Score.objects.using('SentimentAppDB').filter(
                        id=resCrawl['scoreid_id']).values())

                sTextBlob = res[0]['ScoreTextBlob']
                scoreTextBlob = "{" + sTextBlob + "}"
                dt = json.loads(scoreTextBlob)
                if float(dt['polarity']) >= 0.3:
                    textBlobResult = {'value': "positive", 'score': str(dt['polarity'])}
                elif float(dt['polarity']) <= -0.3:
                    textBlobResult = {'value': "negative", 'score': str(dt['polarity'])}
                else:
                    textBlobResult = {'value': "neutral", 'score': str(dt['polarity'])}
                res[0]['ScoreTextBlob'] = textBlobResult

                sVader = res[0]['ScoreVader']
                scoreVader = "{" + sVader + "}"
                d = json.loads(scoreVader)
                if float(d['comp']) >= 0.3:
                    vaderResult = {'value': "positive", 'score': str(d['comp'])}
                elif float(d['comp']) <= -0.3:
                    vaderResult = {'value': "negative", 'score': str(d['comp'])}
                else:
                    vaderResult = {'value': "neutral", 'score': str(d['comp'])}
                res[0]['ScoreVader'] = vaderResult

                sGoogleNLP = res[0]['ScoreGoogleNLP']
                scoreGoogleNLP = "{" + sGoogleNLP + "}"
                da = json.loads(scoreGoogleNLP)
                if float(da['score']) >= 0.3:
                    googleNLPResult = {'value': "positive", 'score': str(da['score'])}
                elif float(da['score']) <= -0.3:
                    googleNLPResult = {'value': "negative", 'score': str(da['score'])}
                else:
                    googleNLPResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreGoogleNLP'] = googleNLPResult

                sStanfordCoreNLP = res[0]['ScoreStanfordCoreNLP']
                scoreStanfordCoreNLP = "{" + sStanfordCoreNLP + "}"
                da = json.loads(scoreStanfordCoreNLP)
                if float(da['score']) < 2:
                    stanfordCoreNLP = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 2:
                    stanfordCoreNLP = {'value': "positive", 'score': str(da['score'])}
                else:
                    stanfordCoreNLP = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreStanfordCoreNLP'] = stanfordCoreNLP

                sAzure = res[0]['ScoreAzure']
                scoreAzure = "{" + sAzure + "}"
                da = json.loads(scoreAzure)
                if float(da['score']) < 0.4:
                    azureResult = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 0.6:
                    azureResult = {'value': "positive", 'score': str(da['score'])}
                else:
                    azureResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreAzure'] = azureResult

                resCrawl['entryTime'] = resCrawl['entryTime'].strftime("%b %d %Y %H:%M:%S")
                id = resCrawl['id']
                item.append(resCrawl)
                score.append(res)

            print("LAST Row ID = " + str(id))
            data = {'value': item, 'score': score, 'status': status, 'index': id}
        else:
            data = {'value': [], 'score': [], 'status': status, 'index': _index}
    else:
        print("Job ended. Crawling done.")
        data = {'value': [], 'score': [], 'status': status, 'index': _index}

    response.append(data)
    return JsonResponse(response, safe=False)