Beispiel #1
0
    def crawlProxy(self):
        """
        Crawl, parse and clean proxy IPs, then store them in the database.

        For each listing page ``self.proxy_url + '/nn/<page>'`` the rows of
        the ``#ip_list`` table are parsed into ``(protocol, ip, port, speed)``
        tuples and inserted into the ``proxy`` table, committing after every
        row. Rows that do not carry the expected cells are skipped. Any
        error is printed together with its traceback and swallowed.

        :return: None
        """
        try:
            print("<*> 开始爬取数据")
            headers = {"User-Agent": self.User_Agent}
            # The page count is fixed here rather than scraped from the
            # site's pagination links.
            # NOTE(review): range() starts at page 0 - confirm the site
            # actually serves '/nn/0'.
            page_num = 3
            insert_sql = 'insert into proxy values(%s,%s,%s,%s)'
            for page in range(page_num):
                proxy_list = []
                proxy_url_n = self.proxy_url + '/nn/' + str(page)
                html = requests.get(proxy_url_n, headers=headers, timeout=20).text
                selector = Selector(text=html)
                all_trs = selector.css("#ip_list tr")
                # The first row is skipped (presumably the table header).
                for tr in all_trs[1:]:
                    all_texts = tr.css("td::text").extract()
                    if len(all_texts) < 6:
                        # Malformed row: skip it instead of aborting the
                        # whole crawl with an IndexError.
                        continue
                    speed_titles = tr.css(".bar::attr(title)").extract()
                    speed_str = speed_titles[0] if speed_titles else ""
                    if speed_str:
                        # The title looks like "<seconds>秒"; keep the number.
                        speed = float(speed_str.split("秒")[0])
                    else:
                        speed = "none"

                    ip = all_texts[0]
                    port = all_texts[1]
                    protocal = all_texts[5]
                    proxy_list.append((protocal, ip, port, speed))
                print("<+> 第%d页爬取成功" %page)

                print("<*> 开始存储第%d页代理" %page)
                DB = MysqlOperate().db
                try:
                    cur = DB.cursor()
                    try:
                        for proxy in proxy_list:
                            cur.execute(insert_sql, (proxy[0], proxy[1], proxy[2], proxy[3]))
                            DB.commit()
                            print("<+> 存储成功:{0}://{1}:{2}-{3}".format(proxy[0],proxy[1],proxy[2],proxy[3]))
                        print("<+> 第%d页存储成功" %page)
                    finally:
                        # Release the cursor even when an insert fails.
                        cur.close()
                finally:
                    # Release the connection even when an insert fails.
                    DB.close()

        except Exception:
            print("<-> 爬取失败")
            traceback.print_exc()
Beispiel #2
0
    def __init__(self):
        """Prepare the page parser: chromedriver path, headless Chrome options, DB object and config filters."""
        print('<*> 正在实例化网页解析器.....')

        # Location of the chromedriver executable on this machine.
        self.executable_path = r'D:\Anaconda3\Scripts\chromedriver.exe'

        # Chrome is started headless; no proxy-server argument is configured.
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.chrome_options = options

        # The `db` object exposed by MysqlOperate, plus the keyword and
        # qualification settings read from configManager.
        self.DB = MysqlOperate().db
        self.keywords = configManager.keywords
        self.qualifications = configManager.qualifications
Beispiel #3
0
 def rebuildProxy(self):
     """
     Rebuild the proxy pool.

     Empties the ``proxy`` table and then repopulates it by calling
     ``self.crawlProxy()``. Any error is printed together with its
     traceback and swallowed.

     :return: None
     """
     print("<*> 开始清理代理池")
     try:
         DB = MysqlOperate().db
         try:
             cur = DB.cursor()
             try:
                 # TRUNCATE empties the table but keeps its definition;
                 # DROP would remove the table itself.
                 cur.execute('truncate proxy')
                 DB.commit()
             finally:
                 # Release the cursor even when the statement fails.
                 cur.close()
         finally:
             # Release the connection even when the statement fails.
             DB.close()
         print("<+> 清理完成")
         self.crawlProxy()
     except Exception:
         print("<-> 清理失败")
         traceback.print_exc()
Beispiel #4
0
    def storeSZSGGZYJYPT(self, data):
        """
        Insert scraped tender records into the ``tenders`` table.

        Every row is stored with the source label '深圳市公共资源交易平台'
        and ``send`` set to "no". All rows are committed together after the
        last insert. Any error is printed together with its traceback and
        swallowed.

        :param data: indexable whose items 0-4 are parallel lists of
            datetimes, hrefs, titles, budgets and deadlines; one row is
            written per entry of the hrefs list
        :return: None
        """
        try:
            datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = (
                data[0], data[1], data[2], data[3], data[4])

            print("<*> 开始存储数据.....")
            order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
            DB = MysqlOperate().db
            try:
                cur = DB.cursor()
                try:
                    for i, href in enumerate(hrefs_list):
                        cur.execute(order, (
                            '深圳市公共资源交易平台', datetimes_list[i], titles_list[i],
                            budgets_list[i], deadlines_list[i], href, "no"))
                    DB.commit()
                finally:
                    # Release the cursor even when an insert fails.
                    cur.close()
            finally:
                # Release the connection even when an insert fails.
                DB.close()
            print("<+> 数据存储成功")
        except Exception:
            print("<-> 数据存储失败")
            traceback.print_exc()
Beispiel #5
0
 def getProxy(self):
     """
     Fetch one randomly chosen row from the ``proxy`` table.

     Any error (including an empty table) is printed together with its
     traceback and swallowed.

     :return: the selected row exactly as the cursor returns it (its first
         three fields are printed), or None when the lookup fails
     """
     try:
         print("<*> 正在从数据库中随机获取代理 IP")
         DB = MysqlOperate().db
         try:
             cur = DB.cursor()
             try:
                 order = 'select * from proxy ORDER BY RAND() LIMIT 1'
                 cur.execute(order)
                 # Raises IndexError on an empty table; handled below.
                 random_proxy = cur.fetchall()[0]
                 DB.commit()
             finally:
                 # Release the cursor even when the lookup fails.
                 cur.close()
         finally:
             # Release the connection even when the lookup fails.
             DB.close()
         print("<+> 随机IP为:  {0}:{1}:{2}".format(random_proxy[0],random_proxy[1], random_proxy[2]))
         print("<+> 获取代理成功")
         return random_proxy
     except Exception:
         print("<-> 获取代理失败")
         traceback.print_exc()
         return None
Beispiel #6
0
    def storeGDSZFCGW(self, data):
        """
        Insert scraped tender records into the ``tenders`` table.

        Every row is stored with the source label '广东省政府采购网' and
        ``send`` set to "no". All rows are committed together after the
        last insert. Any error is printed together with its traceback and
        swallowed.

        :param data: indexable whose items 0-4 are parallel lists of
            datetimes, hrefs, titles, budgets and deadlines; one row is
            written per entry of the hrefs list
        :return: None
        """
        try:
            datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = (
                data[0], data[1], data[2], data[3], data[4])

            print("<*> 开始存储数据.....")
            order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
            DB = MysqlOperate().db
            try:
                cur = DB.cursor()
                try:
                    for i, href in enumerate(hrefs_list):
                        cur.execute(order, (
                            '广东省政府采购网', datetimes_list[i], titles_list[i],
                            budgets_list[i], deadlines_list[i], href, "no"))
                    DB.commit()
                finally:
                    # Release the cursor even when an insert fails.
                    cur.close()
            finally:
                # Release the connection even when an insert fails.
                DB.close()

            print("<+> 数据存储成功")
        except Exception:
            print("<-> 数据存储失败")
            traceback.print_exc()
Beispiel #7
0
    def writeMail(self):
        """
        Build the tender notification mail from all unsent records.

        Reads every row of ``tenders`` whose ``send`` column is "no", flags
        those rows as "yes", renders the rows as an HTML table and writes
        them to an Excel attachment. A failure while writing the Excel file
        is printed with its traceback and swallowed; the path is still
        returned.

        :return: tuple ``(mail_text, mail_table, attachment_path)`` - the
            HTML intro paragraph, the HTML table and the Excel file path
        :raises ValueError: from ``max()``/``min()`` when no unsent rows exist
        """
        DB = MysqlOperate().db
        try:
            cur = DB.cursor()
            try:
                # Fetch the records that have not been mailed yet.
                order1 = 'select * from tenders where send=%s'
                # Parameters must be a sequence; ("no") is only the string
                # "no", which some drivers would iterate character-wise.
                cur.execute(order1, ("no",))
                tender_list = list(cur.fetchall())
                tender_df = pd.DataFrame(tender_list,
                                         columns=[
                                             'source', 'datetimes', 'titles',
                                             'budgets', 'deadlines', 'hrefs', 'send'
                                         ])
                # Flag the records as sent.
                # NOTE(review): this happens before the mail is delivered and
                # also hits rows inserted after the select above - confirm
                # that this is acceptable.
                order2 = 'update tenders set send=%s where send=%s'
                cur.execute(order2, ("yes", "no"))
                DB.commit()
            finally:
                # Release the cursor even when a statement fails.
                cur.close()
        finally:
            # Release the connection even when a statement fails.
            DB.close()

        # Only the date part (text before the first space) is used.
        end = max(tender_df['datetimes']).split(' ')[0]
        start = min(tender_df['datetimes']).split(' ')[0]

        # Mail body text.
        mail_text = """
        <p><b>以下内容为{0}至{1}的招投标信息,详情请看Excel附件。</b></p>
        """.format(start, end)

        # Mail body table: lift the column-width limit while rendering so
        # long cell values are not truncated, then restore the old setting.
        old_width = pd.get_option('display.max_colwidth')
        try:
            try:
                pd.set_option('display.max_colwidth', None)
            except (ValueError, TypeError):
                # Older pandas only accepts integers here; -1 was its
                # spelling of "no limit".
                pd.set_option('display.max_colwidth', -1)
            # escape=False keeps HTML-looking cell content as markup instead
            # of escaping it to plain text.
            mail_table = tender_df.to_html(
                escape=False,
                index=False,
                sparsify=True,
                border=2,
                index_names=False
            )
        finally:
            pd.set_option('display.max_colwidth', old_width)

        # Excel attachment.
        print("<*> 正在生成招投标信息文件.....")
        attachment_path = "H:\\DIBS\\tenders\\tenders_{0}~{1}.xlsx".format(
            start, end)
        try:
            tender_df.to_excel(attachment_path, index=False)
            print("<+> 生成招投标信息文件成功")
        except Exception:
            print("<-> 生成招投标信息文件失败")
            traceback.print_exc()
        return mail_text, mail_table, attachment_path