def test_list_page(request):
    """Start (or reuse) a list-page crawl for the POSTed query.

    On POST a ``resolve_key`` is built from the service type, the fixed tag
    ``'tt'`` and the begin/end dates, stored in the session, and the crawl is
    started through ``test_to_runing`` unless results for that key are
    already cached and the stored sources are unchanged.

    POST fields read: ``service_type``, ``rerun``, ``b_date``, ``e_date``.
    Missing or empty dates fall back to ``util.getDT()``.

    Returns:
        An ``HttpResponse`` whose body is the resolve key. For non-POST
        requests the body is the placeholder ``'error is no resolve_key!'``.
    """
    resolve_key = 'error is no resolve_key!'

    if request.method == 'POST':
        service_type = request.POST.get('service_type', '对外招标')

        # Form values arrive as strings, so a plain truthiness test would
        # treat 'false' or '0' as a request to re-run.  Only explicit
        # negatives (and an absent/empty field) disable the re-run; every
        # other value keeps forcing it, as before.
        raw_rerun = request.POST.get('rerun', False)
        rerun = str(raw_rerun).strip().lower() not in (
            '', 'false', '0', 'no', 'off', 'none')

        # A missing field and an empty field both fall back to util.getDT().
        b_date = request.POST.get('b_date') or util.getDT()  # e.g. '2018-7-16'
        e_date = request.POST.get('e_date') or util.getDT()

        # Build the key that identifies this query.
        resolve_key = str((service_type, 'tt', b_date, e_date))
        request.session['resolve_key'] = resolve_key

        # NOTE(review): the original formatting was lost; the ``else`` is
        # paired with the cache check (not the change check) because that is
        # the only reading in which an uncached key is ever crawled - confirm.
        if (not rerun) and resolve_key in cache.Data.db_pac.keys():
            # Data already exists for this key: re-crawl only when the
            # sources stored in the database differ from the cached ones.
            db_dbo_new = SpidersBaseSource.get_by_resolve_key(resolve_key)
            db_dbo_old = cache.Data.txos[resolve_key].sdata.db_dbo
            if (len(db_dbo_new) != len(db_dbo_old)
                    or str(db_dbo_new) != str(db_dbo_old)):
                test_to_runing(request, resolve_key, db_dbo_new)  # run the crawler
        else:
            test_to_runing(request, resolve_key)  # run the crawler

    return HttpResponse(resolve_key)
def test_list_page_runing(self, request, resolve_key):
    """Run the list-page crawl thread for ``resolve_key`` and wait for it.

    Only POST requests are handled; any other method returns immediately.
    The begin/end dates come from the POST fields ``b_date`` / ``e_date``
    and fall back to ``util.getDT()`` when missing or empty.

    Side effects on ``self.sdata``: ``dbo_resolve_key`` is set to
    ``resolve_key``, ``dbo_urls`` is reset to an empty dict, and the started
    thread is registered in ``test_list_page_threads`` under its name.

    Args:
        request: the incoming HTTP request.
        resolve_key: key identifying the query being crawled.

    Returns:
        None. The call blocks until the worker thread has finished.
    """
    if request.method != 'POST':
        return

    b_date = request.POST.get('b_date', util.getDT())  # e.g. '2018-7-16'
    e_date = request.POST.get('e_date', util.getDT())
    if b_date == '':
        b_date = util.getDT()
    if e_date == '':
        e_date = util.getDT()

    self.sdata.dbo_resolve_key = resolve_key
    self.sdata.dbo_urls = {}
    print('self.MyThread().start() '+str(self.sdata.dbo_resolve_key))

    worker = self.MyThread(self.sdata.db_dbo, b_date, e_date, self)
    worker.start()
    # Register the thread by name on the shared state.
    self.sdata.test_list_page_threads[worker.getName()] = worker
    print('self.MyThread().start().getName() '+str(worker.getName()))
    worker.join()
def test_run(self, url, pw, pn, pt, pg, prs, pr, b_date, e_date, s_data):
    """Load ``url`` in Chrome, run ``processRequest`` on it and print the result.

    Args:
        url: page to load.
        pw, pn, prs, pr: rule values handed to ``processRequest`` as the
            list ``[pw, pn, prs, pr]``.
        pt: run count; converted with ``int()`` and stored in ``_run_count``.
        pg: accepted for signature compatibility; not used here.
        b_date, e_date: date-window bounds as strings. When both are empty
            ``_com_date`` is set to False; otherwise they are parsed into
            ``_b_date`` / ``_e_date``, with an empty begin date defaulting to
            '2000-1-1' and an empty end date to ``util.getDT()``.
        s_data: shared state object, stored as ``_s_data``.

    Returns:
        None. Results are reported through ``self.printT``.

    Raises:
        Whatever the date parser, ``int(pt)``, the web driver or
        ``processRequest`` raise; the browser is shut down first.
    """
    # NOTE(review): the original formatting was lost; the two parse calls are
    # assumed to belong to the ``else`` branch - confirm.
    if b_date == '' and e_date == '':
        self._com_date = False
    else:
        self._com_date = True
        # Default begin date: '2000-1-1'.
        self._b_date = parser.parse(b_date if b_date != '' else '2000-1-1')
        # Default end date: util.getDT().
        self._e_date = parser.parse(e_date if e_date != '' else util.getDT())

    self._s_data = s_data
    self._run_count = int(pt)
    self._runing = True
    dbo = [pw, pn, prs, pr]

    # ``set`` is whatever this file binds under that name (presumably a
    # settings module shadowing the builtin) - verify against the imports.
    options = Options()
    if not set.EXECUTABLE_RUN_SHOW:
        options.add_argument('-headless')  # run without a visible window
    # NOTE(review): whether this flag was originally inside the headless
    # branch cannot be told from the flattened source; it is applied
    # unconditionally here.  The original comment says it works around a
    # Chrome start-up error.
    options.add_argument('--no-sandbox')

    # executable_path can be omitted when the driver is on PATH.
    driver = Chrome(executable_path=set.EXECUTABLE_PATH,
                    chrome_options=options)
    try:
        driver.set_script_timeout(set.WEBDRIVERWAIT_TIMEOUT)
        driver.get(url)
        retss = self.processRequest(driver, dbo)
        self.printT('有效总数据:' + str(len(retss)))
    finally:
        # Always shut the browser down, even when loading or processing
        # fails; otherwise the Chrome process is left running.
        driver.quit()
    self.printT('driver.quit() end')

    for r in retss:
        self.printT(r + '=' + str(retss.get(r)))
    self.printT(url + '_end')
def addSpidersBase(request):
    """Create a ``SpidersBaseSource`` record from POSTed form fields.

    On POST a new record is filled from the request, saved, and its id is
    reported back. Any other method creates nothing.

    Returns:
        An ``HttpResponse`` whose body is the JSON object
        ``{"status": ..., "tid": ...}``: status ``'200'`` with the new
        record id after a save, or status ``'401'`` with ``'no body!'``
        for non-POST requests.
    """
    import json  # local import: the file's import block is not visible here

    ret = 'no body!'
    status = '401'

    if request.method == 'POST':
        sbs = SpidersBaseSource()
        sbs.hash_id = hash(util.getDT(2))
        sbs.update_time = util.getDT(2)

        # Fields copied verbatim from the form; None when absent.
        for field in ('service_type', 'area', 'province', 'city', 'tags',
                      'url_source', 'url_type', 'resolve_type',
                      'resolve_rule', 'resolve_source', 'resolve_sources',
                      'resolve_next_page', 'resolve_page_wait',
                      'bz1', 'bz2'):
            setattr(sbs, field, request.POST.get(field, None))

        sbs.run_count = request.POST.get('run_count', 0)

        # Content-page rule: the form uses a 'c' prefix on these field names.
        sbs.content_page_rule = {
            'resolve_type': request.POST.get('cresolve_type', None),
            'resolve_rule': request.POST.get('cresolve_rule', None),
            'resolve_source': request.POST.get('cresolve_source', None),
        }

        print(sbs)
        print('sbs.save()')
        sbs.save()
        print('return HttpResponse(str(ret))')
        ret = str(sbs.id)
        status = '200'

    # HttpResponse iterates a non-string argument and joins the items, so a
    # dict would be sent as just its keys ("statustid").  Serialize it
    # explicitly so the values reach the client.
    return HttpResponse(json.dumps({'status': status, 'tid': ret}),
                        content_type='application/json')
def print_execl_test_list_page(self, request):
    """Export the collected crawl results to an Excel file and render its link.

    ``self.sdata.dbo_urls`` is read as a mapping of site URL to a list of
    record mappings. Each record becomes one sheet row: the site URL
    followed by every record value, stringified and stripped. The sheet is
    written by ``util.biuderEXECL`` and the download page is rendered with
    the path from ``/pac_excel`` onwards plus the raw data.
    """
    header = ['网站根目录', '更新标题', '更新地址(双击访问)', '更新时间']
    rows = [
        [site] + [str(record.get(field)).strip() for field in record]
        for site, records in self.sdata.dbo_urls.items()
        for record in records
    ]
    dboss = [header] + rows
    print(dboss)

    # File name: date stamp plus the last three characters of the current
    # timestamp's string form.
    stamp = str(time.time())[-3:]
    target = sets.DOWN_URL + 'pac_excel' + util.getDT() + '_' + stamp + '.xls'
    down_url = util.biuderEXECL(dboss, target)

    context = {
        'down_url': down_url[down_url.find('/pac_excel'):],
        'data': self.sdata.dbo_urls,
    }
    return render(request, 'core/down_excel.html', context)
def test_run(self, url, pw, pn, pt, pg, prs, pr, b_date, e_date, s_data):
    """Load ``url`` in Chrome, run ``processRequest`` on it and report the result.

    Failures are not propagated: the exception is stored in ``self._e``,
    recorded in ``s_data.up_dbo_error_urls[url]`` and printed.

    Args:
        url: page to load.
        pw, pn, prs, pr: rule values handed to ``processRequest`` as the
            list ``[pw, pn, prs, pr]``.
        pt, pg: accepted for signature compatibility; not used here.
        b_date, e_date: date-window bounds as strings. When both are empty
            ``_com_date`` is set to False; otherwise they are parsed into
            ``_b_date`` / ``_e_date``, with an empty begin date defaulting to
            '2000-1-1' and an empty end date to ``util.getDT()``.
        s_data: shared state object; supplies ``run_count`` and collects
            failing URLs in ``up_dbo_error_urls``.

    Returns:
        None.

    Raises:
        Only errors from the date parsing, which happens before the
        guarded section.
    """
    # NOTE(review): the original formatting was lost; the two parse calls are
    # assumed to belong to the ``else`` branch - confirm.
    if b_date == '' and e_date == '':
        self._com_date = False
    else:
        self._com_date = True
        # Default begin date: '2000-1-1'.
        self._b_date = parser.parse(b_date if b_date != '' else '2000-1-1')
        # Default end date: util.getDT().
        self._e_date = parser.parse(e_date if e_date != '' else util.getDT())

    try:
        self._s_data = s_data
        self._run_count = s_data.run_count
        self._runing = True
        dbo = [pw, pn, prs, pr]

        options = Options()
        if not sets.EXECUTABLE_RUN_SHOW:
            options.add_argument('-headless')  # run without a visible window
        # NOTE(review): whether this flag was originally inside the headless
        # branch cannot be told from the flattened source; it is applied
        # unconditionally here.  The original comment says it works around a
        # Chrome start-up error.
        options.add_argument('--no-sandbox')

        # executable_path can be omitted when the driver is on PATH.
        driver = Chrome(executable_path=sets.EXECUTABLE_PATH,
                        chrome_options=options)
        try:
            driver.set_script_timeout(sets.WEBDRIVERWAIT_TIMEOUT)
            driver.get(url)  # load the page
            retss = self.processRequest(url, dbo, driver)
        finally:
            # Quit exactly once, and only when a driver was created.  The
            # error handler below must not touch ``driver``: it may not
            # exist yet, or may already have been shut down.
            driver.quit()
            print('driver.quit() end')

        if not self._runing:
            # The run flag is no longer set after processing: record the
            # stored error for this URL.
            self._s_data.up_dbo_error_urls[url] = self._e

        self.printT('有效总数据:' + str(len(retss)))
        for r in retss:
            self.printT(r+'='+str(retss.get(r)))
        self.printT(url + '_end')
    except BaseException as e:
        # Deliberately broad, as in the original: every failure is recorded
        # against the URL instead of aborting the caller.
        self._e = e
        self._s_data.up_dbo_error_urls[url] = self._e
        self.printT('test_run ERROR e='+str(e))