Example #1
    def crawlProxy(self):
        """
        爬取、解析、清洗代理IP,并存储到数据库中
        :return: 代理 IP()
        """
        try:
            print("<*> 开始爬取数据")
            headers = {"User-Agent":self.User_Agent}
            # res = requests.get(self.proxy_url+'/nn/1', headers=headers).text
            # p_nums = '<a href="/nn/.*?">(.*?)</a>'
            # page_nums = re.findall(p_nums, res, re.S)
            # page_num = int(page_nums[3])
            page_num = 3
            for page in range(1, page_num + 1):  # listing pages start at /nn/1 (see the commented request above), not /nn/0
                proxy_list = []
                proxy_url_n = self.proxy_url + '/nn/' + str(page)
                html = requests.get(proxy_url_n, headers=headers, timeout=20).text
                selector = Selector(text=html)
                all_trs = selector.css("#ip_list tr")
                for tr in all_trs[1:]:
                    # extract_first("") avoids an IndexError when a row has no speed cell
                    speed_str = tr.css(".bar::attr(title)").extract_first("")
                    if speed_str:
                        speed = float(speed_str.split("秒")[0])
                    else:
                        speed = "none"
                    all_texts = tr.css("td::text").extract()

                    ip = all_texts[0]
                    port = all_texts[1]
                    protocol = all_texts[5]
                    proxy_list.append((protocol, ip, port, speed))
                print("<+> 第%d页爬取成功" %page)

                print("<*> 开始存储第%d页代理" %page)
                DB = MysqlOperate().db
                cur = DB.cursor()
                for proxy in proxy_list:
                    order = 'insert into proxy values(%s,%s,%s,%s)'
                    cur.execute(order, (proxy[0],proxy[1],proxy[2],proxy[3]))
                    DB.commit()
                    print("<+> 存储成功:{0}://{1}:{2}-{3}".format(proxy[0],proxy[1],proxy[2],proxy[3]))
                print("<+> 第%d页存储成功" %page)
                cur.close()
                DB.close()

        except:
            print("<-> 爬取失败")
            traceback.print_exc()
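
Both of these helpers lean on a MysqlOperate class and a proxy table that the snippets do not show. A minimal sketch of what they are assumed to look like, using pymysql; the connection parameters and the DDL column types are assumptions, not taken from the original source:

    import pymysql

    class MysqlOperate(object):
        """Assumed thin wrapper that exposes a pymysql connection as `self.db`."""
        def __init__(self):
            # Placeholder credentials; the original configuration is not shown.
            self.db = pymysql.connect(host="localhost", user="root",
                                      password="secret", database="dibs",
                                      charset="utf8mb4")

    # Assumed schema matching the four-value INSERT in crawlProxy:
    PROXY_DDL = """
    CREATE TABLE IF NOT EXISTS proxy (
        protocol VARCHAR(16),
        ip       VARCHAR(64),
        port     VARCHAR(16),
        speed    VARCHAR(32)
    )
    """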
Example #2
 def rebuildProxy(self):
     """
     重建代理池
     :return:
     """
     print("<*> 开始清理代理池")
     try:
         # Clear the table contents; DROP is not used here because it would delete the table itself
         DB = MysqlOperate().db
         cur = DB.cursor()
         cur.execute('truncate proxy')
         DB.commit()
         cur.close()
         DB.close()
         print("<+> 清理完成")
         self.crawlProxy()
     except:
         print("<-> 清理失败")
         traceback.print_exc()
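
The comment in rebuildProxy is about keeping the table definition intact: TRUNCATE removes every row but leaves the table in place for the inserts that the following crawlProxy() call issues, whereas DROP would remove the table itself. A short illustration (sketch, same assumed connection wrapper as above):

    cur = MysqlOperate().db.cursor()
    cur.execute('truncate proxy')      # rows gone, table definition kept
    # cur.execute('drop table proxy')  # would delete the table and break later INSERTs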
Example #3
    def storeSZSGGZYJYPT(self, data):
        try:
            datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = data

            print("<*> 开始存储数据.....")
            DB = MysqlOperate().db
            cur = DB.cursor()
            for i in range(len(hrefs_list)):
                order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
                cur.execute(order, ('深圳市公共资源交易平台', datetimes_list[i], titles_list[i],
                                    budgets_list[i], deadlines_list[i], hrefs_list[i], "no"))
            DB.commit()
            cur.close()
            DB.close()
            print("<+> 数据存储成功")
        except:
            print("<-> 数据存储失败")
            traceback.print_exc()
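
The seven-value INSERT implies a tenders table along these lines; a sketch whose column names follow the DataFrame columns used in Example #6, with assumed types:

    TENDERS_DDL = """
    CREATE TABLE IF NOT EXISTS tenders (
        source    VARCHAR(64),
        datetimes VARCHAR(32),
        titles    VARCHAR(255),
        budgets   VARCHAR(64),
        deadlines VARCHAR(64),
        hrefs     VARCHAR(255),
        send      VARCHAR(8)
    )
    """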
Example #4
 def getProxy(self):
     """
     从数据库中随机获取一个可以用的代理
     :return:返回字典类型的取值结果
     """
     try:
         print("<*> 正在从数据库中随机获取代理 IP")
         DB = MysqlOperate().db
         cur = DB.cursor()
         order = 'select * from proxy ORDER BY RAND() LIMIT 1'
         cur.execute(order)
         random_proxy = cur.fetchall()[0]
         DB.commit()
         cur.close()
         DB.close()
         print("<+> 随机IP为:  {0}:{1}:{2}".format(random_proxy[0],random_proxy[1], random_proxy[2]))
         print("<+> 获取代理成功")
         return random_proxy
     except:
         print("<-> 获取代理失败")
         traceback.print_exc()
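
A sketch of how the returned tuple might be fed into requests at a call site; the ProxyPool class name comes from a commented-out line in Example #7, and the target URL is a placeholder:

    proxy = ProxyPool().getProxy()
    if proxy is not None:
        protocol, ip, port = proxy[0], proxy[1], proxy[2]
        proxies = {protocol: "{0}://{1}:{2}".format(protocol, ip, port)}
        resp = requests.get("http://example.com", proxies=proxies, timeout=10)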
Example #5
    def storeGDSZFCGW(self, data):
        try:
            datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = data

            print("<*> 开始存储数据.....")
            DB = MysqlOperate().db
            cur = DB.cursor()
            for i in range(len(hrefs_list)):
                order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
                cur.execute(order, ('广东省政府采购网', datetimes_list[i], titles_list[i],
                                    budgets_list[i], deadlines_list[i], hrefs_list[i], "no"))
            DB.commit()
            cur.close()
            DB.close()
                # data_dict = {"datetimes":datetimes_list[i], "href":hrefs_list[i], "titles":titles_list[i],
                #              "budgets":budgets_list[i], "deadlines":deadlines_list[i], "send":"no"}
                # self.DB.insert(data_dict)

            print("<+> 数据存储成功")
        except:
            print("<-> 数据存储失败")
            traceback.print_exc()
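
storeGDSZFCGW and storeSZSGGZYJYPT differ only in the source label they write, so both could delegate to one shared helper; a sketch of that refactor (the name storeTenders is an assumption, not from the original):

    def storeTenders(self, source, data):
        """Store one batch of tender rows under the given source label (assumed refactor)."""
        datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = data
        DB = MysqlOperate().db
        cur = DB.cursor()
        order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
        for i in range(len(hrefs_list)):
            cur.execute(order, (source, datetimes_list[i], titles_list[i],
                                budgets_list[i], deadlines_list[i], hrefs_list[i], "no"))
        DB.commit()
        cur.close()
        DB.close()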
Example #6
    def writeMail(self):
        # Get the timestamp of the last store (MongoDB variant kept below for reference)
        # tender_info = dbOperation.MongoOperate().collection1
        # last_time = end
        # tender_time = tender_info.find({"datetimes": {"$gt": last_time}}, {"_id": 0, "datetimes":1})
        # datetimes_list = [datetime.strptime(i,"%Y-%m-%d %H:%M") for i in tender_time]
        # start = datetime.strftime(min(datetimes_list),"%Y-%m-%d %H:%M")
        # end = datetime.strftime(max(datetimes_list),"%Y-%m-%d %H:%M")
        # Fetch the records within the update window and convert them to a DataFrame
        # tender = tender_info.find_one({},{"_id": 0})
        # tender_df = pd.DataFrame(tender,index=[0])
        #tenders = tender_info.find({"datetimes":{"$gte":start,"$lte":end}},{"_id":0})
        # tenders = tender_info.find({"send":"no"},{"_id":0})
        # for t in tenders:
        #     t_s = pd.Series(t)
        #     tender_df = tender_df.append(t_s, ignore_index=True)
        # tender_info.update({"send":"no"},{"$set":{"send":"yes"}})

        # Fetch the records whose send flag is still "no"
        DB = MysqlOperate().db
        cur = DB.cursor()
        order1 = 'select * from tenders where send=%s'
        cur.execute(order1, ("no"))
        tender_list = list(cur.fetchall())
        # Convert the fetched rows to a DataFrame
        tender_df = pd.DataFrame(tender_list,
                                 columns=[
                                     'source', 'datetimes', 'titles',
                                     'budgets', 'deadlines', 'hrefs', 'send'
                                 ])
        # Flip the send flag of the fetched records to "yes"
        order2 = 'update tenders set send=%s where send=%s'
        cur.execute(order2, ("yes", "no"))
        DB.commit()
        cur.close()
        DB.close()

        end = max(tender_df['datetimes']).split(' ')[0]
        start = min(tender_df['datetimes']).split(' ')[0]

        # Build the mail body text
        mail_text = """
        <p><b>以下内容为{0}至{1}的招投标信息,详情请看Excel附件。</b></p>
        """.format(start, end)
        # Build the mail body table
        old_width = pd.get_option('display.max_colwidth')  # save the current column-width limit
        pd.set_option('display.max_colwidth', None)  # None (formerly -1) lifts the limit so cells render in full
        mail_table = tender_df.to_html(
            escape=False,
            index=False,
            sparsify=True,
            border=2,
            index_names=False
        )  # escape=False keeps HTML-like markup in cells as real tags instead of escaped text
        pd.set_option('display.max_colwidth', old_width)  # restore the saved column-width limit

        # Build the Excel attachment
        print("<*> 正在生成招投标信息文件.....")
        #attachment_path = "H:\\DIBS\\tenders\\tenders_%s %s.xlsx"%(start,end)
        attachment_path = "H:\\DIBS\\tenders\\tenders_{0}~{1}.xlsx".format(
            start, end)
        try:
            tender_df.to_excel(attachment_path, index=False)
            print("<+> 生成招投标信息文件成功")
        except:
            print("<-> 生成招投标信息文件失败")
            traceback.print_exc()
        return mail_text, mail_table, attachment_path
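
writeMail only prepares the pieces; a sketch of how the returned triple might be assembled and sent with the standard library (addresses, subject, and SMTP host are placeholders, and `mailer` stands for the owning instance):

    import smtplib
    from email.mime.application import MIMEApplication
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    mail_text, mail_table, attachment_path = mailer.writeMail()
    msg = MIMEMultipart()
    msg['Subject'] = 'Tender updates'
    msg['From'] = 'bot@example.com'
    msg['To'] = 'team@example.com'
    # Body text plus the DataFrame rendered as an HTML table
    msg.attach(MIMEText(mail_text + mail_table, 'html', 'utf-8'))
    with open(attachment_path, 'rb') as f:
        part = MIMEApplication(f.read())
        part.add_header('Content-Disposition', 'attachment',
                        filename=attachment_path.split('\\')[-1])
        msg.attach(part)
    with smtplib.SMTP('smtp.example.com') as server:
        server.send_message(msg)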
Example #7
class PageResolver(object):
    def __init__(self):
        print('<*> 正在实例化网页解析器.....')

        self.executable_path=r'D:\Anaconda3\Scripts\chromedriver.exe'
        # self.executable_path = r'E:\ProgramData\Anaconda3\Scripts\chromedriver.exe'

        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        # proxy = ProxyPool().getProxy()
        # chrome_options.add_argument("--proxy-server={0}://{1}:{2}".format
        #                             (proxy[0], proxy[1], proxy[2]))


        self.DB = MysqlOperate().db
        self.keywords = configManager.keywords
        self.qualifications = configManager.qualifications

    def resovleGDSZFCGW(self):
        try:
            print('<*> 正在解析网页.....')
            datetimes_list = []
            hrefs_list = []
            titles_list = []
            budgets_list = []
            deadlines_list = []
            # Get the timestamp of the last stored record from the database
            # last_time = self.table.find({}, {"datetimes": 1, "_id": 0}).sort("datetimes", -1).limit(1)
            # datetime_list = list(last_time)
            # last_time = datetime_list[0]["datetimes"]
            # last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d %H:%M")
            order = 'select max(datetimes) from tenders where source=%s'
            cur = self.DB.cursor()
            cur.execute(order, ('广东省政府采购网',))
            last_time = cur.fetchall()[0][0]
            last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d %H:%M")
            self.DB.commit()
            cur.close()
            self.DB.close()

            # Fetch the page HTML
            browser = webdriver.Chrome(executable_path=self.executable_path, chrome_options=self.chrome_options)
            url = configManager.tender_urls["广东省政府采购网"]
            browser.get(url)
            time.sleep(2)
            browser.maximize_window()

            main_html = browser.page_source
            time.sleep(2)
            p_num = '<span class="aspan">(.*?)</span>'
            num = re.findall(p_num, main_html, re.S)
            page_num = int(num[-1]) - 1  # subtract 1 to avoid running past the last page
            for i in range(page_num):
                print("<*> 正在解析第{0}个网页.....".format(i+1))
                # Parse timestamps, tender links, and titles, and check whether the listings are new
                page_html = browser.page_source
                time.sleep(2)
                p_list = '<ul class="m_m_c_list">(.*?)</ul>'
                li_list = re.findall(p_list, page_html, re.S)
                lis = ''.join(li_list)
                p_datetime = '<em>(.*?)</em>'
                p_href = '<a href="/showNotice/id/(.*?).html"'
                p_title = '<a href=".*?title="(.*?)">'
                datetimes = re.findall(p_datetime, lis, re.S)
                hrefs = re.findall(p_href, lis, re.S)
                titles = re.findall(p_title, lis, re.S)
                # Check whether the page's tender listings are new:
                # take the earliest and latest timestamps on the page
                html_time = [datetime.datetime.strptime(i,"%Y-%m-%d %H:%M") for i in datetimes]
                #min_time = min(html_time)
                max_time = max(html_time)
                # If the stored timestamp is at least the newest one on this page, stop crawling
                if last_time >= max_time:
                    print('<-> 此网页招标信息未更新,本次爬取结束,处理已爬取网页')
                    break
                else:
                    for k in range(len(html_time)):
                        # keep entries newer than the stored timestamp; blank out the rest
                        if html_time[k] <= last_time:
                            datetimes[k] = "none"
                            hrefs[k] = "none"
                            titles[k] = "none"
                    deadlines = []
                    budgets = []
                    for j in range(len(hrefs)):
                        time.sleep(1)
                        if hrefs[j] != "none":
                            full_href = 'http://www.gdgpo.gov.cn/showNotice/id/' + hrefs[j] + '.html'
                            User_Agent = random.choice(configManager.User_Agent)
                            headers = {"User-Agent": User_Agent}
                            content_html = requests.get(full_href, headers=headers).text
                            p_budget = '<div class="zw_c_c_qx">.*?<span>预算金额:(.*?)元</span>.*?</div>'
                            p_deadline = '<p style="text-indent:.*?截止时间:(.*?)</span></p>'
                            budget = re.findall(p_budget, content_html, re.S)
                            deadline = re.findall(p_deadline, content_html, re.S)

                            # Check whether this tender matches the company's business keywords
                            keyword_count = 0
                            for keyword in self.keywords:
                                if keyword in content_html:
                                    keyword_count += 1
                            if keyword_count < 1:
                                print("<-> 此招标信息不符合公司业务,此次爬取终止,继续爬取下一页信息")
                                datetimes[j] = "none"
                                hrefs[j] = "none"
                                titles[j] = "none"
                                budgets.append("none")
                                deadlines.append("none")
                            else:
                                if len(deadline) != 0:
                                    deadlines.append(deadline[0])
                                else:
                                    deadlines.append("null")
                                if len(budget) != 0:
                                    budgets.append(budget[0])
                                else:
                                    budgets.append("null")

                            # Check whether the tender's qualification requirements are met (disabled)
                            # qua_count = 0
                            # for qua in self.qualifications:
                            #     if qua in content_html:
                            #         qua_count += 1
                            # if qua_count < 1:
                            #     print("<-> 公司资质不符合此招标信息,此次爬取终止,继续爬取下一条信息")
                            #     datetimes[i] = "none"
                            #     hrefs[i] = "none"
                            #     titles[i] = "none"
                            #     budgets[i] = "none"
                            #     deadlines[i] = "none"
                            #     break

                datetimes_list.extend(datetimes)
                hrefs_list.extend(hrefs)
                titles_list.extend(titles)
                budgets_list.extend(budgets)
                deadlines_list.extend(deadlines)

                browser.find_element_by_xpath('//*[@id="contianer"]/div[3]/div[2]/div[3]/div/form/a[8]/span').click()

            browser.quit()

            # Drop the "none" placeholders
            datetimes_list = [x for x in datetimes_list if x != "none"]
            hrefs_list = [x for x in hrefs_list if x != "none"]
            titles_list = [x for x in titles_list if x != "none"]
            budgets_list = [x for x in budgets_list if x != "none"]
            deadlines_list = [x for x in deadlines_list if x != "none"]

            if len(deadlines_list) < 1:
                print("<-> 网页解析成功,但并无信息更新")
                update_flag = False
                resolved_data = [[],[],[],[],[]]
                return resolved_data, update_flag
            else:
                print("<+> 网页解析成功,已有信息更新")
                update_flag = True
                resolved_data = [datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list]
                #print(len(datetimes_list),len(hrefs_list),len(titles_list),len(budgets_list),len(deadlines_list))
                print('<+> 解析的信息条数:%d'%len(deadlines_list))
                return resolved_data, update_flag



        except:
            print('<-> 网页解析失败')
            traceback.print_exc()
            return [[],[],[],[],[]],False

    def resovleGZGGZYJYGGFWPT(self):
        try:
            print('<*> 正在解析网页.....')
            datetimes_list = []
            hrefs_list = []
            titles_list = []
            budgets_list = []
            deadlines_list = []

            # Get the timestamp of the last stored record from the database
            order = 'select max(datetimes) from tenders where source=%s'
            cur = self.DB.cursor()
            cur.execute(order, ('广州公共资源交易公共服务平台',))
            last_time = cur.fetchall()[0][0]
            try:
                last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d %H:%M")
            except ValueError:
                # some rows store the date only, without a time component
                last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d")
            self.DB.commit()
            cur.close()
            self.DB.close()

            # Fetch the page HTML
            browser = webdriver.Chrome(executable_path=self.executable_path, chrome_options=self.chrome_options)
            url = configManager.tender_urls["广州公共资源交易公共服务平台"]
            browser.get(url)
            time.sleep(2)
            browser.maximize_window()

            main_html = browser.page_source
            time.sleep(1)
            p_num = '<div class="green-black">.*?/共(.*?)页.*?</div>'
            num = re.findall(p_num, main_html, re.S)
            page_num = int(num[0]) - 1  # subtract 1 to avoid running past the last page
            for i in range(page_num):
                print("<*> 正在解析第{0}个网页.....".format(i+1))
                # Parse timestamps, tender links, and titles, and check whether the listings are new
                page_html = browser.page_source
                time.sleep(1)
                p_list = '<div class="prcont TableListLine".*?<ul>(.*?)<div class="pagesite">'
                li_list = re.findall(p_list, page_html, re.S)
                lis = ''.join(li_list)
                p_datetime = '<span class="floR colorCCC">(.*?)</span>'
                p_href = '<a href="(.*?)" target="_blank"'
                p_title = '<a href=".*?title="(.*?)">'
                datetimes = re.findall(p_datetime, lis, re.S)
                hrefs = re.findall(p_href, lis, re.S)
                titles = re.findall(p_title, lis, re.S)

                # Check whether the page's tender listings are new:
                # take the earliest and latest timestamps on the page
                html_time = [datetime.datetime.strptime(i.strip(),"%Y-%m-%d") for i in datetimes]
                #min_time = min(html_time)
                max_time = max(html_time)
                # If the stored timestamp is at least the newest one on this page, stop crawling
                if last_time >= max_time:
                    print('<-> 此网页招标信息未更新,本次爬取结束,处理已爬取网页')
                    break
                else:
                    for k in range(len(html_time)):
                        # keep entries newer than the stored timestamp; blank out the rest
                        if html_time[k] <= last_time:
                            datetimes[k] = "none"
                            hrefs[k] = "none"
                            titles[k] = "none"
                    deadlines = []
                    budgets = []
                    for j in range(len(hrefs)):
                        time.sleep(1)
                        if hrefs[j] != "none":
                            User_Agent = random.choice(configManager.User_Agent)
                            headers = {"User-Agent": User_Agent}
                            content_html = requests.get(hrefs[j], headers=headers).text
                            p_budget = '预算金额.*?人民币(.*?)元'
                            p_deadline = '<a name="EB053b400bade94be288048635d81830c9">.*?">(.*?)</span>'
                            budget = re.findall(p_budget, content_html, re.S)
                            deadline = re.findall(p_deadline, content_html, re.S)
                            # Check whether this tender matches the company's business keywords
                            keyword_count = 0
                            for keyword in self.keywords:
                                if keyword in content_html:
                                    keyword_count += 1
                            if keyword_count < 1:
                                print("<-> 此招标信息不符合公司业务,此次爬取终止,继续爬取下一页信息")
                                datetimes[j] = "none"
                                hrefs[j] = "none"
                                titles[j] = "none"
                                budgets.append("none")
                                deadlines.append("none")
                            else:
                                if len(deadline) != 0:
                                    deadlines.append(deadline[0])
                                else:
                                    deadlines.append("null")
                                if len(budget) != 0:
                                    budgets.append(budget[0])
                                else:
                                    budgets.append("null")

                datetimes_list.extend(datetimes)
                hrefs_list.extend(hrefs)
                titles_list.extend(titles)
                budgets_list.extend(budgets)
                deadlines_list.extend(deadlines)

                browser.find_element_by_xpath('/html/body/div[6]/div[2]/ul/div/div/div/a[10]').click()
            # Drop the "none" placeholders
            datetimes_list = [x for x in datetimes_list if x != "none"]
            hrefs_list = [x for x in hrefs_list if x != "none"]
            titles_list = [x for x in titles_list if x != "none"]
            budgets_list = [x for x in budgets_list if x != "none"]
            deadlines_list = [x for x in deadlines_list if x != "none"]

            if len(deadlines_list) < 1:
                print("<-> 网页解析成功,但并无信息更新")
                update_flag = False
                resolved_data = [[], [], [], [], []]
                return resolved_data, update_flag
            else:
                print("<+> 网页解析成功,已有信息更新")
                update_flag = True
                resolved_data = [datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list]
                print('<+> 解析的信息条数:%d' % len(deadlines_list))
                return resolved_data, update_flag
        except:
            print('<-> 网页解析失败')
            traceback.print_exc()
            return [[], [], [], [], []], False

    def resovleSZSGGZYJYPT(self):
        try:
            print('<*> 正在解析网页.....')
            datetimes_list = []
            hrefs_list = []
            titles_list = []
            budgets_list = []
            deadlines_list = []
            # Get the timestamp of the last stored record from the database
            order = 'select max(datetimes) from tenders where source=%s'
            cur = self.DB.cursor()
            cur.execute(order, ('深圳市公共资源交易平台',))
            last_time = cur.fetchall()[0][0]
            last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d %H:%M")
            self.DB.commit()
            cur.close()
            self.DB.close()

            # 获取html网页信息
            browser = webdriver.Chrome(executable_path=self.executable_path, chrome_options=self.chrome_options)
            url = configManager.tender_urls["深圳市公共资源交易平台"]
            browser.get(url)
            time.sleep(2)
            browser.maximize_window()

            main_html = browser.page_source
            time.sleep(3)
            p_num = '<div class="page">.*?总共(.*?)页</font>'
            num = re.findall(p_num, main_html, re.S)
            page_num = int(num[0]) - 1  # subtract 1 to avoid running past the last page
            for i in range(page_num):
                print("<*> 正在解析第{0}个网页.....".format(i+1))
                # Parse timestamps, tender links, and titles, and check whether the listings are new
                page_html = browser.page_source
                time.sleep(2)
                p_list = '<div class="tag-list4">(.*?)</div>'
                li_list = re.findall(p_list, page_html, re.S)
                lis = ''.join(li_list)
                p_datetime = '<span>(.*?)</span>'
                p_href = '<a href=".(.*?)"'
                p_title = '<a href=".*?title="(.*?)">'
                datetimes = re.findall(p_datetime, lis, re.S)
                hrefs = re.findall(p_href, lis, re.S)
                titles = re.findall(p_title, lis, re.S)
                # Check whether the page's tender listings are new:
                # take the earliest and latest timestamps on the page
                html_time = [datetime.datetime.strptime(i,"%Y-%m-%d") for i in datetimes]
                #min_time = min(html_time)
                max_time = max(html_time)
                # If the stored timestamp is at least the newest one on this page, stop crawling
                if last_time >= max_time:
                    print('<-> 此网页招标信息未更新,本次爬取结束,处理已爬取网页')
                    break
                else:
                    for j in range(len(html_time)):
                        # keep entries newer than the stored timestamp; blank out the rest
                        if html_time[j] <= last_time:
                            datetimes[j] = "none"
                            hrefs[j] = "none"
                            titles[j] = "none"
                    # print(datetimes,hrefs,titles)
                    deadlines = []
                    budgets = []
                    for j in range(len(hrefs)):
                        time.sleep(1)
                        if hrefs[j] != "none":
                            full_href = 'http://ggzy.sz.gov.cn/cn/jyxx/zfcg/sbj/zbgg' + hrefs[j]
                            User_Agent = random.choice(configManager.User_Agent)
                            headers = {"User-Agent": User_Agent}
                            content_html = requests.get(full_href, headers=headers).text.encode('ISO-8859-1').decode('utf-8')

                            p_budget = '<td>.*?<p align="center">(.*?)</p>.*?</td>'
                            budget_list = re.findall(p_budget, content_html, re.S)
                            # list.index() cannot match a compiled regex, so search the cells
                            # directly: the value sits in the cell after the "预算" label
                            budget_idx = next((k for k, cell in enumerate(budget_list)
                                               if '预算' in cell), None)
                            budget = budget_list[budget_idx + 1] if budget_idx is not None \
                                else budget_list[-1]

                            p_deadline = '报价截止时间:</span><span id="holder">(.*?)</span>'
                            deadline = re.findall(p_deadline, content_html, re.S)
                            if len(deadline) < 1:
                                deadline = ["null"]  # no deadline found; use the placeholder value

                            print(budget, deadline)  # debug output; the rest of this parser is still disabled below
            #
            #                 # 检查网页招标信息是否适合
            #                 keyword_count = 0
            #                 for keyword in self.keywords:
            #                     if keyword in content_html:
            #                         keyword_count += 1
            #                 if keyword_count < 1:
            #                     print("<-> 此招标信息不符合公司业务,此次爬取终止,继续爬取下一页信息")
            #                     datetimes[j] = "none"
            #                     hrefs[j] = "none"
            #                     titles[j] = "none"
            #                     budgets.append("none")
            #                     deadlines.append("none")
            #
            #                 else:
            #                     if len(deadline) != 0:
            #                         deadlines.append(deadline[0])
            #                     else:
            #                         deadlines.append("null")
            #                     if len(budget) != 0:
            #                         budgets.append(budget[0])
            #                     else:
            #                         budgets.append("null")
            #
            #                 # 检查网页招标信息是否符合资质
            #                 # qua_count = 0
            #                 # for qua in self.qualifications:
            #                 #     if qua in content_html:
            #                 #         qua_count += 1
            #                 # if qua_count < 1:
            #                 #     print("<-> 公司资质不符合此招标信息,此次爬取终止,继续爬取下一条信息")
            #                 #     datetimes[i] = "none"
            #                 #     hrefs[i] = "none"
            #                 #     titles[i] = "none"
            #                 #     budgets[i] = "none"
            #                 #     deadlines[i] = "none"
            #                 #     break
            #
            #     datetimes_list.extend(datetimes)
            #     hrefs_list.extend(hrefs)
            #     titles_list.extend(titles)
            #     budgets_list.extend(budgets)
            #     deadlines_list.extend(deadlines)
            #
                if i == 0:
                    # the first page appears to use a different "next page" control
                    browser.find_element_by_xpath('//*[@id="tagContent"]/div/div/div[4]/a[1]').click()
                browser.find_element_by_xpath('//*[@id="tagContent"]/div/div/div[4]/a[3]').click()

            #
            # #print(len(datetimes_list), len(hrefs_list), len(titles_list), len(budgets_list), len(deadlines_list))
            # # 清除“none”值
            # while "none" in datetimes_list:
            #     datetimes_list.remove("none")
            # while "none" in hrefs_list:
            #     hrefs_list.remove("none")
            # while "none" in titles_list:
            #     titles_list.remove("none")
            # while "none" in budgets_list:
            #     budgets_list.remove("none")
            # while "none" in deadlines_list:
            #     deadlines_list.remove("none")
            #
            # if len(deadlines_list) < 1:
            #     print("<-> 网页解析成功,但并无信息更新")
            #     update_flag = False
            #     resolved_data = [[],[],[],[],[]]
            #     return resolved_data, update_flag
            # else:
            #     print("<+> 网页解析成功,已有信息更新")
            #     update_flag = True
            #     resolved_data = [datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list]
            #     #print(len(datetimes_list),len(hrefs_list),len(titles_list),len(budgets_list),len(deadlines_list))
            #     print('<+> 解析的信息条数:%d'%len(deadlines_list))
            #     return resolved_data, update_flag

        except:
            print('<-> 网页解析失败')
            traceback.print_exc()
            return [[],[],[],[],[]],False
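
For context, a sketch of how these pieces are presumably wired together; the driver code below is not in the original source, and TenderStore is a hypothetical owner of the store methods from Examples #3 and #5:

    resolver = PageResolver()
    data, updated = resolver.resovleGDSZFCGW()
    if updated:
        TenderStore().storeGDSZFCGW(data)  # hypothetical class holding storeGDSZFCGW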