def crawlProxy(self):
    """
    Crawl, parse, and clean proxy IPs, then store them in the database
    :return: None (proxies are written to the proxy table)
    """
    try:
        print("<*> Starting to crawl proxies")
        headers = {"User-Agent": self.User_Agent}
        # res = requests.get(self.proxy_url + '/nn/1', headers=headers).text
        # p_nums = '<a href="/nn/.*?">(.*?)</a>'
        # page_nums = re.findall(p_nums, res, re.S)
        # page_num = int(page_nums[3])
        page_num = 3
        for page in range(1, page_num + 1):  # page numbering on the site starts at 1
            proxy_list = []
            proxy_url_n = self.proxy_url + '/nn/' + str(page)
            html = requests.get(proxy_url_n, headers=headers, timeout=20).text
            selector = Selector(text=html)
            all_trs = selector.css("#ip_list tr")
            for tr in all_trs[1:]:  # skip the header row
                speed_str = tr.css(".bar::attr(title)").extract_first()
                if speed_str:
                    speed = float(speed_str.split("秒")[0])
                else:
                    speed = "none"
                all_texts = tr.css("td::text").extract()
                ip = all_texts[0]
                port = all_texts[1]
                protocol = all_texts[5]
                proxy_list.append((protocol, ip, port, speed))
            print("<+> Page %d crawled successfully" % page)
            print("<*> Storing proxies from page %d" % page)
            DB = MysqlOperate().db
            cur = DB.cursor()
            for proxy in proxy_list:
                order = 'insert into proxy values(%s,%s,%s,%s)'
                cur.execute(order, (proxy[0], proxy[1], proxy[2], proxy[3]))
                DB.commit()
                print("<+> Stored: {0}://{1}:{2}-{3}".format(proxy[0], proxy[1], proxy[2], proxy[3]))
            print("<+> Page %d stored successfully" % page)
            cur.close()
            DB.close()
    except Exception:
        print("<-> Crawl failed")
        traceback.print_exc()
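# A minimal sketch of how a crawled proxy could be validated before it is
# stored, assuming the (protocol, ip, port, speed) tuples built above.
# checkProxy is a hypothetical helper, not part of the original module, and
# the httpbin test URL is an arbitrary choice; it simply tries one request
# through the proxy and reports whether it answered in time.
def checkProxy(protocol, ip, port, test_url="http://httpbin.org/ip", timeout=10):
    proxies = {"http": "http://{0}:{1}".format(ip, port),
               "https": "http://{0}:{1}".format(ip, port)}
    try:
        res = requests.get(test_url, proxies=proxies, timeout=timeout)
        return res.status_code == 200
    except requests.RequestException:
        return False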
def rebuildProxy(self):
    """
    Rebuild the proxy pool
    :return: None
    """
    print("<*> Starting to clean the proxy pool")
    try:
        # Clear the table contents; do not use DROP, which would delete the whole table
        DB = MysqlOperate().db
        cur = DB.cursor()
        cur.execute('truncate proxy')
        DB.commit()
        cur.close()
        DB.close()
        print("<+> Cleanup complete")
        self.crawlProxy()
    except Exception:
        print("<-> Cleanup failed")
        traceback.print_exc()
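# A minimal usage sketch for keeping the pool fresh: rebuild it on a fixed
# interval. The pool argument is any object exposing rebuildProxy as defined
# above (the class is presumably the ProxyPool referenced in a commented-out
# line further down); the six-hour interval is an arbitrary assumption.
def rebuildProxyForever(pool, interval_hours=6):
    while True:
        pool.rebuildProxy()
        time.sleep(interval_hours * 3600)  # sleep until the next rebuild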
def storeSZSGGZYJYPT(self, data):
    """Store parsed tenders from 深圳市公共资源交易平台 into the tenders table."""
    try:
        datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = data
        print("<*> Storing data.....")
        DB = MysqlOperate().db
        cur = DB.cursor()
        order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
        for i in range(len(hrefs_list)):
            cur.execute(order, ('深圳市公共资源交易平台', datetimes_list[i], titles_list[i],
                                budgets_list[i], deadlines_list[i], hrefs_list[i], "no"))
        DB.commit()
        cur.close()
        DB.close()
        print("<+> Data stored successfully")
    except Exception:
        print("<-> Failed to store data")
        traceback.print_exc()
def getProxy(self):
    """
    Randomly fetch one usable proxy from the database
    :return: a (protocol, ip, port, speed) tuple
    """
    try:
        print("<*> Fetching a random proxy IP from the database")
        DB = MysqlOperate().db
        cur = DB.cursor()
        order = 'select * from proxy ORDER BY RAND() LIMIT 1'
        cur.execute(order)
        random_proxy = cur.fetchall()[0]
        cur.close()
        DB.close()
        print("<+> Random proxy: {0}://{1}:{2}".format(random_proxy[0], random_proxy[1], random_proxy[2]))
        print("<+> Proxy fetched successfully")
        return random_proxy
    except Exception:
        print("<-> Failed to fetch proxy")
        traceback.print_exc()
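# A minimal sketch of how the tuple returned by getProxy plugs into requests.
# requestViaProxy is a hypothetical helper, not part of the original module;
# pool is any object exposing getProxy as defined above, and the lowercase
# protocol key assumes the site reports "HTTP"/"HTTPS" in that column.
def requestViaProxy(pool, url):
    protocol, ip, port = pool.getProxy()[:3]
    proxies = {protocol.lower(): "{0}://{1}:{2}".format(protocol.lower(), ip, port)}
    return requests.get(url, proxies=proxies, timeout=20)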
def storeGDSZFCGW(self, data):
    """Store parsed tenders from 广东省政府采购网 into the tenders table."""
    try:
        datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = data
        print("<*> Storing data.....")
        DB = MysqlOperate().db
        cur = DB.cursor()
        order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
        for i in range(len(hrefs_list)):
            cur.execute(order, ('广东省政府采购网', datetimes_list[i], titles_list[i],
                                budgets_list[i], deadlines_list[i], hrefs_list[i], "no"))
        DB.commit()
        cur.close()
        DB.close()
        print("<+> Data stored successfully")
    except Exception:
        print("<-> Failed to store data")
        traceback.print_exc()
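# storeSZSGGZYJYPT and storeGDSZFCGW above differ only in the source-name
# constant, so both could delegate to one helper; this is a sketch of that
# refactor, not a function the original module defines.
def storeTenders(source, data):
    datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list = data
    DB = MysqlOperate().db
    cur = DB.cursor()
    order = 'insert into tenders values(%s,%s,%s,%s,%s,%s,%s)'
    for i in range(len(hrefs_list)):
        cur.execute(order, (source, datetimes_list[i], titles_list[i],
                            budgets_list[i], deadlines_list[i], hrefs_list[i], "no"))
    DB.commit()
    cur.close()
    DB.close()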
def writeMail(self):
    """Build the e-mail body text, HTML table, and Excel attachment for unsent tenders."""
    # Fetch the records whose send flag is still "no"
    DB = MysqlOperate().db
    cur = DB.cursor()
    order1 = 'select * from tenders where send=%s'
    cur.execute(order1, ("no",))
    tender_list = list(cur.fetchall())
    # Convert the newly updated records into a DataFrame
    tender_df = pd.DataFrame(tender_list, columns=[
        'source', 'datetimes', 'titles', 'budgets', 'deadlines', 'hrefs', 'send'
    ])
    # Mark the fetched records as sent
    order2 = 'update tenders set send=%s where send=%s'
    cur.execute(order2, ("yes", "no"))
    DB.commit()
    cur.close()
    DB.close()
    end = max(tender_df['datetimes']).split(' ')[0]
    start = min(tender_df['datetimes']).split(' ')[0]
    # Build the e-mail body text
    mail_text = """
    <p><b>The following are tenders posted between {0} and {1}; see the attached Excel file for details.</b></p>
    """.format(start, end)
    # Build the e-mail body table
    old_width = pd.get_option('display.max_colwidth')  # remember the current column width
    pd.set_option('display.max_colwidth', None)  # unlimited width so long cells are not truncated
    mail_table = tender_df.to_html(
        escape=False, index=False, sparsify=True, border=2,
        index_names=False
    )  # escape=False keeps HTML-like cell content as real markup instead of escaped text
    pd.set_option('display.max_colwidth', old_width)  # restore the previous column width
    # Build the Excel attachment
    print("<*> Generating the tender information file.....")
    attachment_path = "H:\\DIBS\\tenders\\tenders_{0}~{1}.xlsx".format(start, end)
    try:
        tender_df.to_excel(attachment_path, index=False)
        print("<+> Tender information file generated")
    except Exception:
        print("<-> Failed to generate the tender information file")
        traceback.print_exc()
    return mail_text, mail_table, attachment_path
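# writeMail only builds the message parts; a sender such as the sketch below
# would be needed to actually deliver them. sendMail is a hypothetical helper
# built on the standard library, and the SMTP host, credentials, and
# recipient list are placeholders.
def sendMail(mail_text, mail_table, attachment_path,
             host="smtp.example.com", user="user@example.com",
             password="password", receivers=("someone@example.com",)):
    import smtplib
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email.mime.application import MIMEApplication

    msg = MIMEMultipart()
    msg["Subject"] = "Tender information update"
    msg["From"] = user
    msg["To"] = ", ".join(receivers)
    msg.attach(MIMEText(mail_text + mail_table, "html", "utf-8"))  # HTML body
    with open(attachment_path, "rb") as f:
        part = MIMEApplication(f.read())
    part.add_header("Content-Disposition", "attachment",
                    filename=attachment_path.split("\\")[-1])
    msg.attach(part)  # Excel file as attachment
    server = smtplib.SMTP(host)
    server.login(user, password)
    server.sendmail(user, list(receivers), msg.as_string())
    server.quit()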
class PageResolver(object):
    def __init__(self):
        print('<*> Instantiating the page resolver.....')
        self.executable_path = r'D:\Anaconda3\Scripts\chromedriver.exe'
        # self.executable_path = r'E:\ProgramData\Anaconda3\Scripts\chromedriver.exe'
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        # proxy = ProxyPool().getProxy()
        # self.chrome_options.add_argument("--proxy-server={0}://{1}:{2}".format(
        #     proxy[0], proxy[1], proxy[2]))
        self.keywords = configManager.keywords
        self.qualifications = configManager.qualifications

    def resovleGDSZFCGW(self):
        """Parse tender listings from 广东省政府采购网."""
        try:
            print('<*> Parsing pages.....')
            datetimes_list = []
            hrefs_list = []
            titles_list = []
            budgets_list = []
            deadlines_list = []
            # Read the timestamp of the last stored record; a fresh connection
            # per call avoids reusing one that an earlier call already closed
            order = 'select max(datetimes) from tenders where source=%s'
            DB = MysqlOperate().db
            cur = DB.cursor()
            cur.execute(order, ('广东省政府采购网',))
            last_time = cur.fetchall()[0][0]
            last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d %H:%M")
            cur.close()
            DB.close()
            # Fetch the listing pages with a headless browser
            browser = webdriver.Chrome(executable_path=self.executable_path,
                                       chrome_options=self.chrome_options)
            url = configManager.tender_urls["广东省政府采购网"]
            browser.get(url)
            time.sleep(2)
            browser.maximize_window()
            main_html = browser.page_source
            time.sleep(2)
            p_num = '<span class="aspan">(.*?)</span>'
            num = re.findall(p_num, main_html, re.S)
            page_num = int(num[-1]) - 1  # subtract 1 to avoid running past the last page
            for i in range(page_num):
                print("<*> Parsing page {0}.....".format(i + 1))
                # Extract post times, detail links, and titles; then check for updates
                page_html = browser.page_source
                time.sleep(2)
                p_list = '<ul class="m_m_c_list">(.*?)</ul>'
                li_list = re.findall(p_list, page_html, re.S)
                lis = ''.join(li_list)
                p_datetime = '<em>(.*?)</em>'
                p_href = '<a href="/showNotice/id/(.*?).html"'
                p_title = '<a href=".*?title="(.*?)">'
                datetimes = re.findall(p_datetime, lis, re.S)
                hrefs = re.findall(p_href, lis, re.S)
                titles = re.findall(p_title, lis, re.S)
                # Compare the newest post time on this page with the stored timestamp
                html_time = [datetime.datetime.strptime(t, "%Y-%m-%d %H:%M") for t in datetimes]
                max_time = max(html_time)
                # If the stored timestamp is newer, nothing on this page is new; stop crawling
                if last_time >= max_time:
                    print('<-> No new tenders on this page; finishing this crawl and processing what was fetched')
                    break
                else:
                    # Blank out entries that are not newer than the stored timestamp
                    for k in range(len(html_time)):
                        if html_time[k] <= last_time:
                            datetimes[k] = "none"
                            hrefs[k] = "none"
                            titles[k] = "none"
                    deadlines = []
                    budgets = []
                    for j in range(len(hrefs)):
                        time.sleep(1)
                        if hrefs[j] != "none":
                            full_href = 'http://www.gdgpo.gov.cn/showNotice/id/' + hrefs[j] + '.html'
                            User_Agent = random.choice(configManager.User_Agent)
                            headers = {"User-Agent": User_Agent}
                            content_html = requests.get(full_href, headers=headers).text
                            p_budget = '<div class="zw_c_c_qx">.*?<span>预算金额:(.*?)元</span>.*?</div>'
                            p_deadline = '<p style="text-indent:.*?截止时间:(.*?)</span></p>'
                            budget = re.findall(p_budget, content_html, re.S)
                            deadline = re.findall(p_deadline, content_html, re.S)
                            # Skip notices that match none of the configured keywords
                            keyword_count = 0
                            for keyword in self.keywords:
                                if keyword in content_html:
                                    keyword_count += 1
                            if keyword_count < 1:
                                print("<-> This tender does not match the company's business; moving on to the next notice")
                                datetimes[j] = "none"
                                hrefs[j] = "none"
                                titles[j] = "none"
                                budgets.append("none")
                                deadlines.append("none")
                            else:
                                if len(deadline) != 0:
                                    deadlines.append(deadline[0])
                                else:
                                    deadlines.append("null")
                                if len(budget) != 0:
                                    budgets.append(budget[0])
                                else:
budgets.append("null") # 检查网页招标信息是否符合资质 # qua_count = 0 # for qua in self.qualifications: # if qua in content_html: # qua_count += 1 # if qua_count < 1: # print("<-> 公司资质不符合此招标信息,此次爬取终止,继续爬取下一条信息") # datetimes[i] = "none" # hrefs[i] = "none" # titles[i] = "none" # budgets[i] = "none" # deadlines[i] = "none" # break datetimes_list.extend(datetimes) hrefs_list.extend(hrefs) titles_list.extend(titles) budgets_list.extend(budgets) deadlines_list.extend(deadlines) browser.find_element_by_xpath('//*[@id="contianer"]/div[3]/div[2]/div[3]/div/form/a[8]/span').click() browser.quit() # 清除“none”值 while "none" in datetimes_list: datetimes_list.remove("none") while "none" in hrefs_list: hrefs_list.remove("none") while "none" in titles_list: titles_list.remove("none") while "none" in budgets_list: budgets_list.remove("none") while "none" in deadlines_list: deadlines_list.remove("none") if len(deadlines_list) < 1: print("<-> 网页解析成功,但并无信息更新") update_flag = False resolved_data = [[],[],[],[],[]] return resolved_data, update_flag else: print("<+> 网页解析成功,已有信息更新") update_flag = True resolved_data = [datetimes_list, hrefs_list, titles_list, budgets_list, deadlines_list] #print(len(datetimes_list),len(hrefs_list),len(titles_list),len(budgets_list),len(deadlines_list)) print('<+> 解析的信息条数:%d'%len(deadlines_list)) return resolved_data, update_flag except: print('<-> 网页解析失败') traceback.print_exc() return [[],[],[],[],[]],False def resovleGZGGZYJYGGFWPT(self): try: print('<*> 正在解析网页.....') datetimes_list = [] hrefs_list = [] titles_list = [] budgets_list = [] deadlines_list = [] # 获取数据库中上次的时间 order = 'select max(datetimes) from tenders where source=%s' cur = self.DB.cursor() cur.execute(order,'广州公共资源交易公共服务平台') last_time = cur.fetchall()[0][0] try: last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d %H:%M") except: last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d") self.DB.commit() cur.close() self.DB.close() # 获取html网页信息 browser = webdriver.Chrome(executable_path=self.executable_path, chrome_options=self.chrome_options) url = configManager.tender_urls["广州公共资源交易公共服务平台"] browser.get(url) time.sleep(2) browser.maximize_window() main_html = browser.page_source time.sleep(1) p_num = '<div class="green-black">.*?/共(.*?)页.*?</div>' num = re.findall(p_num, main_html, re.S) page_num = int(num[0]) - 1 # 减1防止索引溢出 for i in range(page_num): print("<*> 正在解析第{0}个网页.....".format(i+1)) # 解析时间,招标信息链接,标题,检查信息是否更新 page_html = browser.page_source time.sleep(1) p_list = '<div class="prcont TableListLine".*?<ul>(.*?)<div class="pagesite">' li_list = re.findall(p_list, page_html, re.S) lis = ''.join(li_list) p_datetime = '<span class="floR colorCCC">(.*?)</span>' p_href = '<a href="(.*?)" target="_blank"' p_title = '<a href=".*?title="(.*?)">' datetimes = re.findall(p_datetime, lis, re.S) hrefs = re.findall(p_href, lis, re.S) titles = re.findall(p_title, lis, re.S) # 检查网页招标信息是否更新 # 获取网页中最早和最晚时间 html_time = [datetime.datetime.strptime(i.strip(),"%Y-%m-%d") for i in datetimes] #min_time = min(html_time) max_time = max(html_time) # 若数据库中上次的时间大于本次网页最大时间,则退出本次爬取 if last_time >= max_time: print('<-> 此网页招标信息未更新,本次爬取结束,处理已爬取网页') break else: for i in range(len(html_time)): if html_time[i] > last_time: datetimes[i] = datetimes[i] hrefs[i] = hrefs[i] titles[i] = titles[i] else: datetimes[i] = "none" hrefs[i] = "none" titles[i] = "none" deadlines = [] budgets = [] for j in range(len(hrefs)): time.sleep(1) if hrefs[j] != "none": User_Agent = random.choice(configManager.User_Agent) headers = {"User-Agent": User_Agent} content_html = 
                            content_html = requests.get(hrefs[j], headers=headers).text
                            p_budget = '预算金额.*?人民币(.*?)元'
                            p_deadline = '<a name="EB053b400bade94be288048635d81830c9">.*?">(.*?)</span>'
                            budget = re.findall(p_budget, content_html, re.S)
                            deadline = re.findall(p_deadline, content_html, re.S)
                            # Skip notices that match none of the configured keywords
                            keyword_count = 0
                            for keyword in self.keywords:
                                if keyword in content_html:
                                    keyword_count += 1
                            if keyword_count < 1:
                                print("<-> This tender does not match the company's business; moving on to the next notice")
                                datetimes[j] = "none"
                                hrefs[j] = "none"
                                titles[j] = "none"
                                budgets.append("none")
                                deadlines.append("none")
                            else:
                                if len(deadline) != 0:
                                    deadlines.append(deadline[0])
                                else:
                                    deadlines.append("null")
                                if len(budget) != 0:
                                    budgets.append(budget[0])
                                else:
                                    budgets.append("null")
                    datetimes_list.extend(datetimes)
                    hrefs_list.extend(hrefs)
                    titles_list.extend(titles)
                    budgets_list.extend(budgets)
                    deadlines_list.extend(deadlines)
                browser.find_element_by_xpath('/html/body/div[6]/div[2]/ul/div/div/div/a[10]').click()
            browser.quit()
            # Drop the "none" placeholders
            while "none" in datetimes_list:
                datetimes_list.remove("none")
            while "none" in hrefs_list:
                hrefs_list.remove("none")
            while "none" in titles_list:
                titles_list.remove("none")
            while "none" in budgets_list:
                budgets_list.remove("none")
            while "none" in deadlines_list:
                deadlines_list.remove("none")
            if len(deadlines_list) < 1:
                print("<-> Pages parsed successfully, but nothing was updated")
                update_flag = False
                resolved_data = [[], [], [], [], []]
                return resolved_data, update_flag
            else:
                print("<+> Pages parsed successfully; updates found")
                update_flag = True
                resolved_data = [datetimes_list, hrefs_list, titles_list,
                                 budgets_list, deadlines_list]
                print('<+> Number of parsed records: %d' % len(deadlines_list))
                return resolved_data, update_flag
        except Exception:
            print('<-> Page parsing failed')
            traceback.print_exc()
            return [[], [], [], [], []], False

    def resovleSZSGGZYJYPT(self):
        """Parse tender listings from 深圳市公共资源交易平台."""
        try:
            print('<*> Parsing pages.....')
            datetimes_list = []
            hrefs_list = []
            titles_list = []
            budgets_list = []
            deadlines_list = []
            # Read the timestamp of the last stored record from the database
            order = 'select max(datetimes) from tenders where source=%s'
            DB = MysqlOperate().db  # fresh connection; see note in resovleGDSZFCGW
            cur = DB.cursor()
            cur.execute(order, ('深圳市公共资源交易平台',))
            last_time = cur.fetchall()[0][0]
            last_time = datetime.datetime.strptime(last_time, "%Y-%m-%d %H:%M")
            cur.close()
            DB.close()
            # Fetch the listing pages with a headless browser
            browser = webdriver.Chrome(executable_path=self.executable_path,
                                       chrome_options=self.chrome_options)
            url = configManager.tender_urls["深圳市公共资源交易平台"]
            browser.get(url)
            time.sleep(2)
            browser.maximize_window()
            main_html = browser.page_source
            time.sleep(3)
            p_num = '<div class="page">.*?总共(.*?)页</font>'
            num = re.findall(p_num, main_html, re.S)
            page_num = int(num[0]) - 1  # subtract 1 to avoid running past the last page
            for i in range(page_num):
                print("<*> Parsing page {0}.....".format(i + 1))
                # Extract post times, detail links, and titles; then check for updates
                page_html = browser.page_source
                time.sleep(2)
                p_list = '<div class="tag-list4">(.*?)</div>'
                li_list = re.findall(p_list, page_html, re.S)
                lis = ''.join(li_list)
                p_datetime = '<span>(.*?)</span>'
                p_href = '<a href=".(.*?)"'
                p_title = '<a href=".*?title="(.*?)">'
                datetimes = re.findall(p_datetime, lis, re.S)
                hrefs = re.findall(p_href, lis, re.S)
                titles = re.findall(p_title, lis, re.S)
                # Compare the newest post time on this page with the stored timestamp
                html_time = [datetime.datetime.strptime(t, "%Y-%m-%d") for t in datetimes]
                max_time = max(html_time)
                # If the stored timestamp is newer, nothing on this page is new; stop crawling
                if last_time >= max_time:
                    print('<-> No new tenders on this page; finishing this crawl and processing what was fetched')
                    break
                else:
                    # Blank out entries that are not newer than the stored timestamp
                    for k in range(len(html_time)):
                        if html_time[k] <= last_time:
                            datetimes[k] = "none"
                            hrefs[k] = "none"
                            titles[k] = "none"
                    deadlines = []
                    budgets = []
                    for j in range(len(hrefs)):
                        time.sleep(1)
                        if hrefs[j] != "none":
                            full_href = 'http://ggzy.sz.gov.cn/cn/jyxx/zfcg/sbj/zbgg' + hrefs[j]
                            User_Agent = random.choice(configManager.User_Agent)
                            headers = {"User-Agent": User_Agent}
                            # The page declares the wrong charset, so re-decode it as UTF-8
                            content_html = requests.get(full_href, headers=headers) \
                                .text.encode('ISO-8859-1').decode('utf-8')
                            # The budget sits in the table cell following the one
                            # labelled 预算; scan the extracted cells for that label
                            # (list.index cannot match a regex against strings)
                            p_budget = '<td>.*?<p align="center">(.*?)</p>.*?</td>'
                            budgetlist = re.findall(p_budget, content_html, re.S)
                            budget = []
                            for k, cell in enumerate(budgetlist):
                                if '预算' in cell and k + 1 < len(budgetlist):
                                    budget = [budgetlist[k + 1]]
                                    break
                            p_deadline = '报价截止时间:</span><span id="holder">(.*?)</span>'
                            deadline = re.findall(p_deadline, content_html, re.S)
                            # Skip notices that match none of the configured keywords
                            keyword_count = 0
                            for keyword in self.keywords:
                                if keyword in content_html:
                                    keyword_count += 1
                            if keyword_count < 1:
                                print("<-> This tender does not match the company's business; moving on to the next notice")
                                datetimes[j] = "none"
                                hrefs[j] = "none"
                                titles[j] = "none"
                                budgets.append("none")
                                deadlines.append("none")
                            else:
                                if len(deadline) != 0:
                                    deadlines.append(deadline[0])
                                else:
                                    deadlines.append("null")
                                if len(budget) != 0:
                                    budgets.append(budget[0])
                                else:
                                    budgets.append("null")
                    datetimes_list.extend(datetimes)
                    hrefs_list.extend(hrefs)
                    titles_list.extend(titles)
                    budgets_list.extend(budgets)
                    deadlines_list.extend(deadlines)
                # The "next page" link sits at a different index on the first page
                if i == 0:
                    browser.find_element_by_xpath('//*[@id="tagContent"]/div/div/div[4]/a[1]').click()
                else:
                    browser.find_element_by_xpath('//*[@id="tagContent"]/div/div/div[4]/a[3]').click()
            browser.quit()
            # Drop the "none" placeholders
            while "none" in datetimes_list:
                datetimes_list.remove("none")
            while "none" in hrefs_list:
                hrefs_list.remove("none")
            while "none" in titles_list:
                titles_list.remove("none")
            while "none" in budgets_list:
                budgets_list.remove("none")
            while "none" in deadlines_list:
                deadlines_list.remove("none")
            if len(deadlines_list) < 1:
                print("<-> Pages parsed successfully, but nothing was updated")
                update_flag = False
                resolved_data = [[], [], [], [], []]
                return resolved_data, update_flag
            else:
                print("<+> Pages parsed successfully; updates found")
                update_flag = True
                resolved_data = [datetimes_list, hrefs_list, titles_list,
                                 budgets_list, deadlines_list]
                print('<+> Number of parsed records: %d' % len(deadlines_list))
                return resolved_data, update_flag
        except Exception:
            print('<-> Page parsing failed')
            traceback.print_exc()
            return [[], [], [], [], []], False
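# Every method above repeats the connect/cursor/commit/close boilerplate; a
# context manager like this sketch could centralize it. It assumes
# MysqlOperate().db returns a DB-API connection, as the code above implies;
# the import would normally sit at the top of the module.
from contextlib import contextmanager

@contextmanager
def tenderCursor():
    DB = MysqlOperate().db
    cur = DB.cursor()
    try:
        yield cur
        DB.commit()  # commit only if the block ran without raising
    finally:
        cur.close()
        DB.close()

# Usage: with tenderCursor() as cur: cur.execute('select * from tenders')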