def get_detail_info(url, headers, proxies, ipdata, msg):
    childresponse = get_response_get(url, headers, proxies, ipdata, msg)
    if childresponse:
        print('detail page:', url)
        selector = etree.HTML(childresponse.text, etree.HTMLParser())
        supplier_info = {}
        supplier_info.update({'source_url': url})
        try:
            # main info: the spans alternate between field names and values
            brief_info = selector.xpath(
                '//div[@class="dljgContainer"]/div[@class="dljg_infor"]/div/ul/li/p/span')
            for i in range(0, len(brief_info), 2):
                title = brief_info[i].xpath('text()')[0].strip()
                value = brief_info[i + 1].xpath('text()')[0].strip()
                supplier_info.update({title: value})
        except Exception as e:
            fail_time = trans_date_str(datetime.datetime.now())
            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed: <{e}>\n')
        # basic info: up to three tables
        try:
            # first table: each row holds alternating field name / value cells
            table_info = selector.xpath('//div[@class="byf_table_jibenxinxi"]/table[1]')
            for table_ele in table_info:
                tr_ele = table_ele.xpath('tr')
                if len(tr_ele) > 1:
                    for row in tr_ele:
                        detail_info = row.xpath('td/text()')
                        if len(detail_info) > 1:
                            for j in range(0, len(detail_info), 2):
                                title = detail_info[j]
                                value = detail_info[j + 1]
                                supplier_info.update({title: value})
            # remaining tables: the first row holds the headers, following rows the values
            table_info_type2 = selector.xpath(
                '//div[@class="byf_table_jibenxinxi"]/table[position()>1]')
            for table_ele in table_info_type2:
                tr_ele = table_ele.xpath('tr')
                if len(tr_ele) > 1:
                    header_row = tr_ele[0]
                    titles = [td.xpath('text()')[0] for td in header_row.xpath('td')]
                    for value_ele in tr_ele[1:]:
                        values = value_ele.xpath('td')
                        values = [td.xpath('text()')[0] if len(td.xpath('text()')) > 0 else ''
                                  for td in values]
                        supplier_info.update(dict(zip(titles, values)))
        except Exception as e:
            fail_time = trans_date_str(datetime.datetime.now())
            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed: <{e}>\n')
        with open('government_procurement/neimenggu/supplier.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(supplier_info, ensure_ascii=False))
            f.write(',')
def get_response(url, proxies, ipdata, msg, trynum=0):
    if trynum == 10:  # give up after switching proxies 10 times in a row
        return None
    try:
        response = requests.get(url=url, headers=headers, proxies=proxies,
                                timeout=20, allow_redirects=False)
        if response.status_code == 200:
            return response
        elif response.status_code == 404:
            ipdata.update_ipdata(msg)  # drop the current proxy
            msg = ipdata.get_ipdata()
            proxies = ipdata.get_proxy(msg)
            trynum += 1
            return get_response(url, proxies, ipdata, msg, trynum=trynum)
        else:
            fail_time = trans_date_str(datetime.datetime.now())
            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                f.write(f'{fail_time}:<{url}>: failed to fetch page: <{response.status_code}>\n')
            return None
    except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
        # the proxy was refused or timed out: drop it and retry with a fresh one
        ipdata.update_ipdata(msg)
        msg = ipdata.get_ipdata()
        proxies = ipdata.get_proxy(msg)
        trynum += 1
        return get_response(url, proxies, ipdata, msg, trynum=trynum)
    except Exception as e:
        print(f'error while fetching <{url}>: <{e}>')
        ipdata.update_ipdata(msg)  # drop the current proxy
        msg = ipdata.get_ipdata()
        proxies = ipdata.get_proxy(msg)
        trynum += 1
        return get_response(url, proxies, ipdata, msg, trynum=trynum)
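# get_response_get() and get_response_post() are called throughout this file but are not
# defined in this section. The sketches below are assumptions modelled on get_response()
# above: they take explicit headers (and form data for POST) and rotate the proxy on
# failure. The project's real helpers may implement a different retry policy.
def get_response_get(url, headers, proxies, ipdata, msg, trynum=0):
    if trynum == 10:  # assumed limit: give up after 10 consecutive proxy switches
        return None
    try:
        response = requests.get(url=url, headers=headers, proxies=proxies, timeout=20)
        if response.status_code == 200:
            return response
    except requests.exceptions.RequestException:
        pass
    ipdata.update_ipdata(msg)  # drop the current proxy and retry with a new one
    msg = ipdata.get_ipdata()
    proxies = ipdata.get_proxy(msg)
    return get_response_get(url, headers, proxies, ipdata, msg, trynum=trynum + 1)


def get_response_post(url, headers, proxies, form_data, ipdata, msg, trynum=0):
    if trynum == 10:
        return None
    try:
        response = requests.post(url=url, headers=headers, proxies=proxies,
                                 data=form_data, timeout=20)
        if response.status_code == 200:
            return response
    except requests.exceptions.RequestException:
        pass
    ipdata.update_ipdata(msg)
    msg = ipdata.get_ipdata()
    proxies = ipdata.get_proxy(msg)
    return get_response_post(url, headers, proxies, form_data, ipdata, msg, trynum=trynum + 1)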
def get_supplier_info(childresponse):  # extract the fields from a supplier detail page
    childselector = etree.HTML(childresponse.text, etree.HTMLParser())
    try:
        tr_eleselector = childselector.xpath('/html/body/div[2]/table/tbody/tr')

        def get_dict(tr):  # collect the <th> name / <td> value pairs from one row
            names = tr.xpath('th')
            if names:
                values = tr.xpath('td/text()')
                row = {}
                for i in range(len(names)):
                    name = names[i].xpath('text()')[0].strip()
                    value = values[i].strip() if i < len(values) else ''
                    row[name] = value
                return row
            return None

        supplier_info = list(map(get_dict, tr_eleselector))
        return supplier_info
    except Exception as e:
        fail_time = trans_date_str(datetime.datetime.now())
        with open('failed_url.txt', 'a', encoding='utf-8') as f:
            f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed: <{e}>\n')
        return None
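# trans_date_str() and get_uuid() are also used throughout this file without being defined
# in this section. Minimal sketches, assuming trans_date_str() formats a datetime as a plain
# string and get_uuid() returns a random hex identifier; the project's own helpers may use
# a different format.
import uuid  # assumed to be imported at the top of the file


def trans_date_str(date_value):
    # format a datetime as 'YYYY-MM-DD HH:MM:SS' for log lines and create_time fields
    return date_value.strftime('%Y-%m-%d %H:%M:%S')


def get_uuid():
    # random 32-character hex id used for proj_id / bid_id / file_id
    return uuid.uuid4().hex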
def get_content(url, bid_id, proj_id):  # fetch and parse the announcement body for a url
    ipdata = IpData()
    msg = ipdata.get_ipdata()
    proxies = ipdata.get_proxy(msg)
    create_time = trans_date_str(datetime.datetime.now())
    pattern = re.compile('(style.*/style)', re.I)

    def deal_response_content(response):

        def deal_table(selector):  # extract the summary table
            info_addr = "ABSTRACT"
            table_list = selector.xpath('//div[@class="table"]/table/tr/td')
            if len(table_list) > 0:  # found the target elements
                title_list = selector.xpath(
                    '//div[@class="table"]/table/tr/td[contains(@class, "title")]')
                titles = []
                values = []
                for ele in table_list:
                    if ele in title_list:
                        titles.append(ele.xpath('text()')[0])
                    elif len(titles) > len(values):
                        value = ele.xpath('text()')
                        values.append('' if value == [] else value[0])
                href_list = selector.xpath(
                    '//div[@class="table"]/table/tr/td/a[contains(@title, "点击下载")]')
                file_url_list = []
                if len(href_list) > 0:
                    for href_ele in href_list:
                        url_suffix = href_ele.xpath('@id')[0]
                        # the url prefix should currently be http://www.ccgp.gov.cn
                        file_url = "/oss/download?uuid={}".format(url_suffix)
                        file_url_list.append(file_url)
                    for i in range(1, len(file_url_list) + 1):
                        values[-i] = file_url_list[-i]
                table = dict(zip(titles, values))
                build_bid_info_data = _format_bid_info(table, bid_id, info_addr, create_time)
                file_attach_ele = selector.xpath('//a[@class="bizDownload"]')  # attachments in the summary
                files = []
                if len(file_attach_ele) > 0:
                    for file in file_attach_ele:
                        file_name = file.xpath('text()')[0].strip()
                        file_url = file.xpath('@id')[0]
                        file_id = get_uuid()
                        files.append((file_name, file_url, file_id, bid_id, proj_id))
                return build_bid_info_data, files
            return [], []  # fallback when the page has no summary table

        def deal_content(selector):  # extract the body text
            content_list = selector.xpath('//div[@class="vF_detail_content"]//text()')
            content_value = ''
            if len(content_list) > 0:  # found the target elements
                for content in content_list:
                    # restore line breaks and normalize non-breaking spaces
                    content = content.replace('***', '\r\n').replace('\xa0', ' ')
                    if not content.strip().startswith('<'):
                        content_value += content
            file_attach_ele = selector.xpath("//a[contains(@ignore, '1')]")  # attachments in the body
            files = []
            if len(file_attach_ele) > 0:
                for file in file_attach_ele:
                    file_name = file.xpath('text()')
                    if len(file_name) > 0:
                        file_name = file_name[0]
                        file_url = file.xpath('@href')[0]
                        if file_url == 'javascript:;':
                            continue
                        if file_url == '':
                            file_url = file.xpath('@id')[0]
                        file_id = get_uuid()
                        files.append((file_name, file_url, file_id, bid_id, proj_id))
            origin_bid_text_data = (0, bid_id, content_value, 1, 0, create_time, operator)
            return origin_bid_text_data, files

        # mark block ends with *** so line breaks survive the text extraction
        response_text = response.text.encode(web_encoding).decode("utf-8")
        response_text = response_text.replace('</p>', '***</p>').replace(
            '</h>', '***</h>').replace('<br>', '***<br>')
        style_value = pattern.findall(response_text)
        if len(style_value) > 0:
            for style in style_value:
                response_text = response_text.replace(style, '')
        baseSelector = etree.HTML(response_text, etree.HTMLParser())
        bid_info_data_table, file_table = deal_table(baseSelector)  # summary
        bid_text_data, file_text = deal_content(baseSelector)  # body
        file_table.extend(file_text)
        return bid_info_data_table, bid_text_data, file_table

    # print('current url:', url)
    response = get_response(url, proxies, ipdata, msg)
    if response:
        return deal_response_content(response)
def deal_list_page(response):
    create_time = trans_date_str(datetime.datetime.now())
    info_addr = 'LIST'
    selector = etree.HTML(response.text, etree.HTMLParser())
    url_list_href = selector.xpath('//ul[@class="vT-srch-result-list-bid"]/li/a')
    ori_list_href = selector.xpath('//ul[@class="vT-srch-result-list-bid"]/li/span')
    proj_bid_list = []
    origin_bid_data = []  # buffered and stored page by page
    origin_bid_info_data = []
    origin_text_data_list = []
    file_attach_list = []
    for i in range(len(url_list_href)):
        url_href = url_list_href[i]
        ori_href = ori_list_href[i]
        url = url_href.xpath("@href")[0]  # announcement link
        bid_title = url_href.xpath("text()")[0].strip()  # announcement title
        proj_id = get_uuid()  # project id
        # a separate table still needs to store (project id, project title)
        bid_id = get_uuid()
        proj_bid_list.append((proj_id, bid_id, create_time, operator))
        ori_info = ori_href.xpath('text()')
        release_time = ori_info[0].strip().split('|')[0].strip()  # release time
        purchasing_agent = ori_info[0].strip().split('|')[1].strip().split(':')[1]  # purchaser
        agency = ori_info[0].strip().split('|')[2].strip().split(':')[1]  # agency
        strong_info = ori_href.xpath('strong//text()')
        bid_type = strong_info[0].strip().split('|')[0]  # announcement type
        project_type = strong_info[1].strip().split('|')[0]  # project type
        region = ori_href.xpath('a//text()')  # region
        if len(region) != 0:
            region = region[0]
        origin_bid_data.append((0, bid_id, bid_title, bid_type, source, url,
                                create_time, operator))
        bid_info_dict = {
            'release_time': release_time,
            'purchasing_agent': purchasing_agent,
            'agency': agency,
            'bid_type': bid_type,
            'project_type': project_type,
            'region': region
        }
        origin_bid_info_data_list = _format_bid_info(bid_info_dict, bid_id, info_addr, create_time)
        origin_bid_info_data_table, origin_bid_text_data, files_attach = get_content(url, bid_id, proj_id)
        origin_bid_info_data.extend(origin_bid_info_data_list)
        origin_bid_info_data.extend(origin_bid_info_data_table)
        origin_text_data_list.append(origin_bid_text_data)
        file_attach_list.extend(files_attach)
    proj_bid_data = _format_r_projbid(proj_bid_list)
    rprojbid = RProjBid()  # r_proj_bid
    rprojbid.insertmany(proj_bid_data)
    origin_bid_info = OriginBidInfo()  # t_origin_bid_info
    origin_bid_info.insertmany(tuple(origin_bid_info_data))
    origin_bid_text = OriginBidText()  # t_origin_bid_text
    origin_bid_text.insertmany(tuple(origin_text_data_list))
    origin_bid = OriginBid()  # t_origin_bid
    print(len(origin_bid_data), 'announcements collected from the current page')
    origin_bid.insertmany(tuple(origin_bid_data))  # insert into t_origin_bid
    file_attach_data = _format_file_attach(file_attach_list, create_time, operator)
    if len(file_attach_data) > 0:
        tfileattach = TFileAttach()  # t_file_attach
        tfileattach.insertmany(file_attach_data)
def get_agency(self):

    def get_agency_info(childresponse):  # extract the fields from an agency detail page
        childselector = etree.HTML(childresponse.text, etree.HTMLParser())
        tr_eleselector = childselector.xpath('/html/body/div[2]/table[1]/tbody/tr')

        def get_dict(tr):  # collect the <th> name / <td> value pairs from one row
            names = tr.xpath('th')
            if names:
                values = tr.xpath('td/text()')
                row = {}
                for i in range(len(names)):
                    name = names[i].xpath('text()')[0].strip()
                    value = values[i].strip() if i < len(values) else ''
                    row[name] = value
                return row
            return None

        if tr_eleselector:
            agency_info = list(map(get_dict, tr_eleselector))
            return agency_info
        else:
            print(childresponse.request.url, 'failed to extract data from this agency page')

    ipdata = IpData()
    msg = ipdata.get_ipdata()
    proxies = ipdata.get_proxy(msg)
    form_data = {
        'pointPageIndexId': '1',
        'pageIndex': '1',  # the maximum page count comes from the first page
        'pageSize': '10',
    }
    # max_page = self.get_max_page(self.agency_url, proxies, form_data, ipdata, msg)
    headers['referer'] = 'http://www.ccgp-guangdong.gov.cn/organization/queryPerformOrgList.do'
    # headers['cookie'] = 'Ks8ae9gdPofpF0yrRJi1UrDsaM-hm8uARsgRyaj46O9l8dsmqJyJ!-1509577578'
    headers['cookie'] = 'hlce-4C6jnLFpch9x2dUya_0eBJR--2owaXh62fo9E2FQRFQfWrf!-1509577578'
    # for page in range(19, max_page + 1):
    for page in range(28, 114):  # pages 28-113; earlier pages were presumably crawled already
        print('agency list page {}'.format(page))
        form_data['pageIndex'] = page
        # headers['Content-Length'] = str(len(form_data))
        response = get_response_post(self.agency_url, headers, proxies, form_data,
                                     ipdata, msg, trynum=0)
        print('list page:', form_data)
        if response:
            # parse the agency detail urls, skipping duplicates
            selector = etree.HTML(response.text, etree.HTMLParser())
            childurls = selector.xpath('//td[@align="center"]/a')
            unduplicate_childurls = []
            for childurl in childurls:
                childurl = self.base_url + childurl.xpath('@href')[0]
                if childurl not in unduplicate_childurls:
                    unduplicate_childurls.append(childurl)
                    childresponse = get_response_get(childurl, headers, proxies, ipdata, msg)
                    print('detail page:', childurl)
                    if childresponse:
                        try:
                            agency_info = get_agency_info(childresponse)
                            agency_info.append({'url_source': childurl})
                            with open('government_procurement/guangdong/agency.txt', 'a',
                                      encoding='utf-8') as f:
                                f.write(json.dumps(agency_info, ensure_ascii=False))
                                f.write(',')
                        except Exception as e:
                            fail_time = trans_date_str(datetime.datetime.now())
                            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                                f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed: <{e}>\n')
                        finally:
                            time.sleep(3)
                    else:
                        print('failed to fetch the detail page')
                        time.sleep(3)
                else:
                    continue
        else:
            print('failed to parse the agency detail urls')
            continue
def get_suppliers(self):

    def get_supplier_info(childresponse):  # extract the fields from a supplier detail page
        childselector = etree.HTML(childresponse.text, etree.HTMLParser())
        try:
            tr_eleselector = childselector.xpath('/html/body/div[2]/table/tbody/tr')

            def get_dict(tr):  # collect the <th> name / <td> value pairs from one row
                names = tr.xpath('th')
                if names:
                    values = tr.xpath('td/text()')
                    row = {}
                    for i in range(len(names)):
                        name = names[i].xpath('text()')[0].strip()
                        value = values[i].strip() if i < len(values) else ''
                        row[name] = value
                    return row
                return None

            supplier_info = list(map(get_dict, tr_eleselector))
            return supplier_info
        except Exception as e:
            fail_time = trans_date_str(datetime.datetime.now())
            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed: <{e}>\n')
            return None

    ipdata = IpData()
    msg = ipdata.get_ipdata()
    proxies = ipdata.get_proxy(msg)
    form_data = {
        'pointPageIndexId': '1',
        'pageIndex': '1',  # the maximum page count comes from the first page
        'pageSize': '10',
    }
    max_page = self.get_max_page(self.supplier_url, proxies, form_data, ipdata, msg)
    headers['referer'] = 'http://www.ccgp-guangdong.gov.cn/organization/querySellerOrgList.do'
    headers['cookie'] = 'Ks8ae9gdPofpF0yrRJi1UrDsaM-hm8uARsgRyaj46O9l8dsmqJyJ!-1509577578'
    for page in range(178, max_page + 1):  # resume at page 178 and loop up to the maximum page
        print('supplier list page {}'.format(page))
        form_data['pageIndex'] = page
        # headers['Content-Length'] = str(len(form_data))
        response = get_response_post(self.supplier_url, headers, proxies, form_data,
                                     ipdata, msg, trynum=0)
        if response:
            # parse the supplier detail urls, skipping duplicates
            selector = etree.HTML(response.text, etree.HTMLParser())
            childurls = selector.xpath('//div[@class="m_m_cont"]//tr/td[3]/a')
            real_childurls = []
            for childurl in childurls:
                childurl = self.base_url + childurl.xpath('@href')[0]
                if childurl not in real_childurls:
                    real_childurls.append(childurl)
                    childresponse = get_response_get(childurl, headers, proxies, ipdata, msg)
                    if childresponse:
                        print('detail page:', childurl)
                        try:
                            supplier_info = get_supplier_info(childresponse)
                            supplier_info.append({'url_source': childurl})
                            with open('government_procurement/guangdong/supplier.txt', 'a',
                                      encoding='utf-8') as f:
                                f.write(json.dumps(supplier_info, ensure_ascii=False))
                                f.write(',')
                        except Exception as e:
                            fail_time = trans_date_str(datetime.datetime.now())
                            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                                f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed: <{e}>\n')
                    else:
                        print('failed to fetch the detail page')
                        time.sleep(3)
        else:
            continue
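# A hypothetical entry point, assuming get_agency() and get_suppliers() are methods of a
# spider class; "GuangdongSpider" and its argument-free constructor are placeholders, since
# the real class name, constructor, and attributes (base_url, supplier_url, agency_url,
# get_max_page) are defined elsewhere in the project.
if __name__ == '__main__':
    spider = GuangdongSpider()  # hypothetical class name
    spider.get_suppliers()      # crawl supplier detail pages
    spider.get_agency()         # crawl agency detail pages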