Beispiel #1
0
def test_list_page(request):  # crawl data
    """Trigger a crawl for the posted service type / date range if needed.

    For POST requests a ``resolve_key`` is built from the service type, the
    fixed tag ``'tt'`` and the begin/end dates, and stored in the session.
    The crawler is started when a rerun is requested, when no cached entry
    exists for the key, or when the source rows fetched for the key differ
    from the cached ones.

    Returns an ``HttpResponse`` whose body is the resolve key (or a fixed
    error text for non-POST requests).
    """
    resolve_key = 'error is no resolve_key!'
    if request.method != 'POST':
        return HttpResponse(resolve_key)

    service_type = request.POST.get('service_type', '对外招标')

    # Form values arrive as strings, so a plain truthiness test would treat
    # 'false' / '0' as "rerun requested".  Only explicit negatives (and the
    # empty string) disable the rerun; any other value keeps enabling it.
    rerun = request.POST.get('rerun', False)
    if isinstance(rerun, str):
        rerun = rerun.strip().lower() not in ('', 'false', '0', 'no', 'off')
    else:
        rerun = bool(rerun)

    # Missing or empty dates fall back to util.getDT().
    today = util.getDT()
    b_date = request.POST.get('b_date') or today  # e.g. '2018-7-16'
    e_date = request.POST.get('e_date') or today

    # Build the resolve_key and remember it for later requests.
    resolve_key = str((service_type, 'tt', b_date, e_date))
    request.session['resolve_key'] = resolve_key

    if rerun or resolve_key not in cache.Data.db_pac.keys():
        test_to_runing(request, resolve_key)  # run the crawler
        return HttpResponse(resolve_key)

    # Cached data exists: re-run only when the source rows have changed.
    db_dbo_new = SpidersBaseSource.get_by_resolve_key(resolve_key)
    db_dbo_cached = cache.Data.txos[resolve_key].sdata.db_dbo
    if (len(db_dbo_new) != len(db_dbo_cached)
            or str(db_dbo_new) != str(db_dbo_cached)):
        test_to_runing(request, resolve_key, db_dbo_new)  # run the crawler

    return HttpResponse(resolve_key)
Beispiel #2
0
    def test_list_page_runing(self, request, resolve_key):
        """Run the list-page crawl thread for ``resolve_key`` and wait for it.

        Only POST requests are handled; anything else returns ``None``
        without side effects.  The begin/end dates come from the POST data
        and fall back to ``util.getDT()`` when missing or empty.

        Side effects on ``self.sdata``: ``dbo_resolve_key`` is set to
        ``resolve_key``, ``dbo_urls`` is reset to an empty dict, and the
        started thread is registered in ``test_list_page_threads`` under
        its name.
        """
        if request.method != 'POST':
            return

        today = util.getDT()
        b_date = request.POST.get('b_date') or today  # e.g. '2018-7-16'
        e_date = request.POST.get('e_date') or today

        self.sdata.dbo_resolve_key = resolve_key
        self.sdata.dbo_urls = {}
        print('self.MyThread().start() '+str(self.sdata.dbo_resolve_key))

        worker = self.MyThread(self.sdata.db_dbo, b_date, e_date, self)
        worker.start()
        self.sdata.test_list_page_threads[worker.getName()] = worker
        print('self.MyThread().start().getName() '+str(worker.getName()))
        # NOTE(review): presumably MyThread is a threading.Thread subclass, in
        # which case join() makes this call block until the crawl finishes.
        worker.join()
Beispiel #3
0
    def test_run(self, url, pw, pn, pt, pg, prs, pr, b_date, e_date, s_data):
        """Load ``url`` in Chrome, run ``processRequest`` and print the results.

        Args:
            url: Page to load.
            pw, pn, prs, pr: Passed to ``processRequest`` as the list
                ``[pw, pn, prs, pr]``.
            pt: Converted with ``int()`` and stored as ``self._run_count``.
            pg: Not used by this method.
            b_date, e_date: Date strings.  When both are empty, date
                comparison is disabled (``self._com_date = False``);
                otherwise an empty begin date defaults to ``'2000-1-1'``
                and an empty end date to ``util.getDT()``.
            s_data: Stored as ``self._s_data``.

        The browser is always closed, even when loading or processing the
        page raises; the exception itself still propagates to the caller.
        """
        if b_date == '' and e_date == '':
            self._com_date = False
        else:
            self._com_date = True
            self._b_date = parser.parse(
                b_date if b_date != '' else '2000-1-1')  # default begin date
            self._e_date = parser.parse(
                e_date if e_date != '' else util.getDT())  # default end date

        self._s_data = s_data
        self._run_count = int(pt)
        self._runing = True
        dbo = [pw, pn, prs, pr]

        options = Options()
        if not set.EXECUTABLE_RUN_SHOW:
            options.add_argument('-headless')  # run without a visible window
            # Original author's note: Chrome failed to start without this flag.
            options.add_argument('--no-sandbox')
        # executable_path may be omitted when the driver is on PATH;
        # otherwise it has to be an absolute path.
        driver = Chrome(executable_path=set.EXECUTABLE_PATH,
                        chrome_options=options)
        try:
            driver.set_script_timeout(set.WEBDRIVERWAIT_TIMEOUT)
            driver.get(url)
            retss = self.processRequest(driver, dbo)  # run the extraction
            self.printT('有效总数据:' + str(len(retss)))
        finally:
            # Always release the browser process, including on failure.
            driver.quit()
        self.printT('driver.quit() end')
        for r in retss:
            self.printT(r + '=' + str(retss.get(r)))
        self.printT(url + '_end')  # end marker for this url
Beispiel #4
0
def addSpidersBase(request):  # add
    """Create a ``SpidersBaseSource`` record from POST data.

    For POST requests every field is read from ``request.POST`` (missing
    fields become ``None``, ``run_count`` defaults to ``0``), the record is
    saved, and its id is reported back.  Other methods create nothing.

    Returns an ``HttpResponse`` with a JSON body
    ``{"status": ..., "tid": ...}``: ``'200'`` and the new record id on
    success, ``'401'`` and ``'no body!'`` otherwise.
    """
    import json

    ret = 'no body!'
    status = '401'
    if request.method == 'POST':
        post = request.POST
        sbs = SpidersBaseSource()
        # NOTE(review): if util.getDT(2) returns a str, hash() is salted per
        # process, so hash_id is not reproducible across restarts - confirm
        # whether that matters before relying on it as a stable key.
        sbs.hash_id = hash(util.getDT(2))
        sbs.update_time = util.getDT(2)

        # Fields copied from the POST data under the same name.
        for field in ('service_type', 'area', 'province', 'city', 'tags',
                      'url_source', 'url_type', 'resolve_type',
                      'resolve_rule', 'resolve_source', 'resolve_sources',
                      'resolve_next_page', 'resolve_page_wait', 'bz1', 'bz2'):
            setattr(sbs, field, post.get(field, None))
        sbs.run_count = post.get('run_count', 0)

        # Content-page rules are posted with a 'c' prefix on the field name.
        sbs.content_page_rule = {
            'resolve_type': post.get('cresolve_type', None),
            'resolve_rule': post.get('cresolve_rule', None),
            'resolve_source': post.get('cresolve_source', None),
        }

        print(sbs)
        print('sbs.save()')
        sbs.save()
        print('return HttpResponse(str(ret))')
        ret = str(sbs.id)
        status = '200'

    # Passing a dict straight to HttpResponse would emit only its keys, so
    # the payload is serialised explicitly.
    return HttpResponse(json.dumps({'status': status, 'tid': ret}),
                        content_type='application/json')
Beispiel #5
0
 def print_execl_test_list_page(self, request):
     """Export the crawled entries to an Excel file and render a download page.

     ``self.sdata.dbo_urls`` maps a site root url to a list of entries.
     Each entry becomes one sheet row: the root url followed by every
     entry value, stringified and stripped.  The table is passed to
     ``util.biuderEXECL`` together with a generated target path, and the
     ``core/down_excel.html`` template is rendered with the part of the
     returned path that starts at ``/pac_excel``.
     """
     table = [['网站根目录', '更新标题', '更新地址(双击访问)', '更新时间']]
     for root_url, entries in self.sdata.dbo_urls.items():
         for entry in entries:
             cells = [str(entry.get(key)).strip() for key in entry]
             table.append([root_url] + cells)
     print(table)

     # File name: 'pac_excel' + date + '_' + last three characters of the
     # current timestamp string.
     target = ''.join([
         sets.DOWN_URL,
         'pac_excel',
         util.getDT(),
         '_',
         str(time.time())[-3:],
         '.xls',
     ])
     written = util.biuderEXECL(table, target)  # expects a list of rows
     context = {
         'down_url': written[written.find('/pac_excel'):],
         'data': self.sdata.dbo_urls,
     }
     return render(request, 'core/down_excel.html', context)
Beispiel #6
0
    def test_run(self, url, pw, pn, pt, pg, prs, pr, b_date, e_date, s_data):
        """Load ``url`` in Chrome, run ``processRequest`` and record failures.

        Args:
            url: Page to load; also the key used when recording an error.
            pw, pn, prs, pr: Passed to ``processRequest`` as the list
                ``[pw, pn, prs, pr]``.
            pt, pg: Not used by this method.
            b_date, e_date: Date strings.  When both are empty, date
                comparison is disabled (``self._com_date = False``);
                otherwise an empty begin date defaults to ``'2000-1-1'``
                and an empty end date to ``util.getDT()``.
            s_data: Stored as ``self._s_data``; its ``run_count`` is copied
                to ``self._run_count`` and errors are written to its
                ``up_dbo_error_urls`` mapping.

        Any exception raised while crawling is caught, stored in
        ``self._e`` and in ``s_data.up_dbo_error_urls[url]``, and logged.
        The browser is closed whenever it was successfully started.
        """
        if b_date == '' and e_date == '':
            self._com_date = False
        else:
            self._com_date = True
            self._b_date = parser.parse(b_date if b_date != '' else '2000-1-1')  # default begin date
            self._e_date = parser.parse(e_date if e_date != '' else util.getDT())  # default end date
        try:
            self._s_data = s_data
            self._run_count = s_data.run_count
            self._runing = True
            dbo = [pw, pn, prs, pr]

            options = Options()
            if not sets.EXECUTABLE_RUN_SHOW:
                options.add_argument('-headless')  # run without a visible window
                # Original author's note: Chrome failed to start without this flag.
                options.add_argument('--no-sandbox')
            # executable_path may be omitted when the driver is on PATH;
            # otherwise it has to be an absolute path.
            driver = Chrome(executable_path=sets.EXECUTABLE_PATH, chrome_options=options)
            try:
                driver.set_script_timeout(sets.WEBDRIVERWAIT_TIMEOUT)
                driver.get(url)  # load the page
                retss = self.processRequest(url, dbo, driver)  # run the extraction
            finally:
                # quit() lives here, not in the except handler, so it is only
                # attempted when the driver actually exists.
                driver.quit()
                print('driver.quit() end')

            if not self._runing:
                # The run was flagged as not running: record the stored
                # error for this url.
                self._s_data.up_dbo_error_urls[url] = self._e
            self.printT('有效总数据:' + str(len(retss)))
            for r in retss:
                self.printT(r+'='+str(retss.get(r)))
            self.printT(url + '_end')  # end marker for this url
        except BaseException as e:
            # NOTE(review): BaseException also swallows KeyboardInterrupt and
            # SystemExit; kept because callers currently never see an
            # exception from this method - confirm before narrowing.
            self._e = e
            self._s_data.up_dbo_error_urls[url] = self._e  # record the failure for this url
            self.printT('test_run ERROR e='+str(e))