def str_to_htm(s): return etree.HTML(s)
def url_l(a):
    yifile = a
    yifile = etree.HTML(yifile)
    w = "http://sc.chinaz.com"
    url_list = []
    mz = []
    # 矢量 (vector graphics)
    sl = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[1]/text()')[0]
    su = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[1]/@href')[0]
    su = w + su
    url_list.append(su)
    mz.append(sl)
    # 高清图片 (HD pictures)
    gq = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[2]/text()')[0]
    gu = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[2]/@href')[0]
    gu = w + gu
    url_list.append(gu)
    mz.append(gq)
    # 图标 (icons)
    tb = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[3]/text()')[0]
    tu = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[3]/@href')[0]
    tu = w + tu
    url_list.append(tu)
    mz.append(tb)
    # PSD素材 (PSD materials)
    psd = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[4]/text()')[0]
    pu = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos"]/a[4]/@href')[0]
    pu = w + pu
    url_list.append(pu)
    mz.append(psd)
    # 字体 (fonts) -- this nav entry already carries an absolute URL
    zt = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[1]/text()')[0]
    zu = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[1]/@href')[0]
    url_list.append(zu)
    mz.append(zt)
    # 英文字体 (English fonts) -- absolute URL as well
    yw = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[3]/text()')[0]
    ywu = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[3]/@href')[0]
    url_list.append(ywu)
    mz.append(yw)
    # 音效 (sound effects)
    yx = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[4]/text()')[0]
    yxu = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[4]/@href')[0]
    yxu = w + yxu
    url_list.append(yxu)
    mz.append(yx)
    # PPT模板 (PPT templates)
    ppt = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[3]/text()')[0]
    ppu = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[3]/@href')[0]
    ppu = w + ppu
    url_list.append(ppu)
    mz.append(ppt)
    # 简历模板 (resume templates)
    jl = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[4]/text()')[0]
    ju = yifile.xpath('//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[4]/@href')[0]
    ju = w + ju
    url_list.append(ju)
    mz.append(jl)
    # print(sl, su, gq, gu, tb, tu, psd, pu, zt, zu, yw, ywu, yx, yxu, ppt, ppu, jl, ju, url_list, mz)
    return [url_list, mz]
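# The nine per-category blocks in url_l() above differ only in the li class and the a index,
# so they could be collapsed into a data-driven loop. A minimal sketch, assuming the same nav
# markup; the entries table (li class, a index, whether the href needs the site prefix) is
# inferred from the code above, not taken from the live site.
def url_l_compact(page_html):
    tree = etree.HTML(page_html)
    base = "http://sc.chinaz.com"
    entries = [
        ("nos", 1, True), ("nos", 2, True), ("nos", 3, True), ("nos", 4, True),
        ("nos no2", 1, False), ("nos no2", 3, False), ("nos no2", 4, True),
        ("nos no3", 3, True), ("nos no3", 4, True),
    ]
    url_list, mz = [], []
    for cls, idx, relative in entries:
        xp = '//div//div[@class="nav"]/ul/li[@class="{}"]/a[{}]'.format(cls, idx)
        name = tree.xpath(xp + '/text()')[0]
        href = tree.xpath(xp + '/@href')[0]
        url_list.append(base + href if relative else href)
        mz.append(name)
    return [url_list, mz]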
def get_page_show_ret(mystr, bs64_str):
    '''
    :param bs64_str: the base64-encoded font data used for transcoding
    :return: the transcoded string
    '''
    font = TTFont(BytesIO(base64.decodestring(bs64_str.encode())))
    c = font['cmap'].tables[0].ttFont.tables['cmap'].tables[0].cmap
    ret_list = []
    for char in mystr:
        decode_num = ord(char)
        if decode_num in c:
            num = c[decode_num]
            num = int(num[-2:]) - 1
            ret_list.append(num)
        else:
            ret_list.append(char)
    ret_str_show = ''
    for num in ret_list:
        ret_str_show += str(num)
    return ret_str_show


if __name__ == '__main__':
    get_ip_from_db()
    html = requests.get(url=rent_url.format('领地OFFICE'), headers=headers,
                        proxies=random.choice(proxies))
    bs64_str = re.findall("charset=utf-8;base64,(.*?)'\)", html.content)[0]
    selector = etree.HTML(html.content)
    price = selector.xpath('//*[@id="list-content"]/div[13]/div[2]/p/strong/b')
    res = get_page_show_ret(price[0].text, bs64_str)
    print res
# import requests
from selenium import webdriver
from lxml import etree

header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
url = 'http://irm.cninfo.com.cn/ircs/interaction/viewQuestionForSzse.do?questionId=5651078'
# req = requests.get(url, headers=header)
# print(req.status_code)
# page = req.text
driver = webdriver.PhantomJS()
driver.get(url)
tree = etree.HTML(driver.page_source)
x_div = tree.xpath('//div[@class="msgCnt cntcolor"]')
for div in x_div:
    print(div.xpath('./text() | ./div/text()'))
def get_results(self, url):
    html = httpget(url)
    ehtml = etree.HTML(html)
    results = ehtml.xpath("//ul[@class='img']/li")
    return results
def showurl(page):
    ht = etree.HTML(page)
    url = ht.xpath('//ul[@class="searchResultListUl"]'
                   '//div[@class="searchResultJobinfo fr"]'
                   '//a[@target="_blank"]/@href')
    return url
def load_get_html(self, url):
    if url is None:
        return
    # print(url)
    try:
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers,
                                proxies=proxies).content.decode('utf-8')
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get_html error:{}'.format(e))
    else:
        title = selector.xpath('//div[@class="title"]/h1/text()')
        if title != []:
            title = re.sub(r'\r|\n|\s', '', title[0])
            try:
                status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
            except:
                status = '公告'
        else:
            title = None
            status = '公告'
        # print(title)
        # print(status)
        _id = self.hash_to_md5(url)
        publish_date = selector.xpath('//div[@class="extra"]/text()')
        if publish_date != []:
            publish_date = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(publish_date)).group()
        else:
            publish_date = None
        # print(publish_date)
        area_name = '江苏-南京'
        # print(area_name)
        source = 'http://www.njgp.gov.cn/'
        table_ele_li = selector.xpath('//div[@class="cont"]/div')
        content_html = ''
        for table_ele in table_ele_li[1:4]:
            content_html += etree.tostring(table_ele, encoding="utf-8",
                                           pretty_print=True, method="html").decode('utf-8')
        retult_dict = dict()
        retult_dict['_id'] = _id
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = area_name
        retult_dict['source'] = source
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '南京市政府采购网'
        retult_dict['en_name'] = 'Nanjing City Government Procurement'
        # print(retult_dict)
        # print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)
def parse(self, response):
    # Tmall (天猫) pages
    if 'tmall' in str(response.meta['url']):
        content = response.text
        html = etree.HTML(content)
        # regex section
        pattern_model = re.compile(r'型号</th><td> (.*?)</td>')
        pattern_productname = re.compile(r'>产品名称:(.+)<')
        pattern_shopname = re.compile(r'data-spm="d4918089"><strong>(.+?)</')
        pattern_brand = re.compile(r'品牌: (.+?);</')
        pattern_score = re.compile(r'shopdsr-score-con">(.+?)</')
        model = re.findall(pattern_model, content)[0]
        productname = re.findall(pattern_productname, content)[0]
        shopname = re.findall(pattern_shopname, content)[0]
        brand = re.findall(pattern_brand, content)[0]
        score = re.findall(pattern_score, content)
        # xpath section
        title = html.xpath('//h1[@data-spm="1000983"]/a/text()')[0]
        # crawl with webdriver, optionally through a proxy
        browser = webdriver.Chrome()
        # options = webdriver.ChromeOptions()
        # options.add_argument('user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"')
        # options.add_argument('--proxy-server=http://122.114.31.177:8080')
        # browser = webdriver.Chrome(chrome_options=options)
        browser.get(response.meta['url'])
        # scroll down
        js = "var q=document.documentElement.scrollTop=10000"
        browser.execute_script(js)
        time.sleep(2)
        html_sele = etree.HTML(browser.page_source)
        sellcount = html_sele.xpath('//li[@data-label="月销量"]/div/span[2]/text()')[0]
        reviewcount = html_sele.xpath('//li[@class="tm-ind-item tm-ind-reviewCount canClick tm-line3"]/div/span[2]/text()')[0]
        newp = html_sele.xpath('//div[@class="tb-detail-hd"]/p/text()')[0]
        promoprice = html_sele.xpath('//div[@class="tm-promo-price"]/span/text()')[0]
        button_comment = browser.find_element_by_xpath('//ul[@class="tabbar tm-clear"]/li[3]')
        button_comment.click()
        time.sleep(2)
        # after scrolling, extract from the comments pane
        pattern_comment = re.compile(r'title="(.+)分')
        content = browser.page_source
        ratescore = re.findall(pattern_comment, content)[0]
        browser.quit()
        items = dict()
        items['model'] = model
        items['productname'] = productname
        items['shopname'] = shopname
        items['brand'] = brand
        items['score'] = score
        items['title'] = title
        items['sellcount'] = sellcount
        items['reviewcount'] = reviewcount
        items['newp'] = newp
        items['promoprice'] = promoprice
        items['ratescore'] = ratescore
        items['rank'] = response.meta['rank']
        items['url'] = response.meta['url']
        for item in items.items():
            print(item)
    else:
        items = dict()
        browser = webdriver.Chrome()
        browser.get(response.meta['url'])
        time.sleep(2)
        html = etree.HTML(browser.page_source)
        browser.quit()
        model = html.xpath('//ul[@class="attributes-list"]/li[4]/text()')
        brand = html.xpath('//ul[@class="attributes-list"]/li[3]/text()')
        shopname = html.xpath('//div[@class="tb-shop-name"]/dl/dd/strong/a/text()')
        title = html.xpath('//h3[@class="tb-main-title"]/text()')
        sellcount = html.xpath('//div[@class="tb-sell-counter"]/a/strong/text()')
        reviewcount = html.xpath('//div[@class="tb-rate-counter"]/a/strong/text()')
        score = html.xpath('//dd[@class="tb-rate-lower"]/a/text()')
        promoprice = html.xpath('//strong[@class="tb-promo-price"]/em[2]/text()')
        items['model'] = model
        items['productname'] = response.meta['productname']
        items['shopname'] = shopname
        items['brand'] = brand
        items['score'] = score
        items['title'] = title
        items['sellcount'] = sellcount
        items['reviewcount'] = reviewcount
        items['newp'] = ''
        items['promoprice'] = promoprice
        items['ratescore'] = ''
        items['rank'] = response.meta['rank']
        items['url'] = response.meta['url']
        for item in items.items():
            print(item)
def crawling(url):
    """
    Grab cve specific information
    :param url: string
    :return xpth_list: list
    """
    xpth_list = []
    if url is None or url == "" or url.find("http") == -1:
        print("crawling, url:", url)
        return
    try:
        content = requests.get(url).content
    except requests.exceptions.ConnectionError:
        print('ConnectionError')
        return []
    except requests.exceptions.ChunkedEncodingError:
        print('ChunkedEncodingError')
        return []
    if content and len(content) > 1:
        html = etree.HTML(content)
        try:
            # if html.xpath('/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[3]/div[2]/div[1]/div[2]/span/span/a/text()') == ["N/A"] or \
            #    html.xpath('/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[2]/div[2]/div[1]/div[2]/span/span/a/text()') == ["N/A"] or \
            #    html.xpath('/html/body/div[2]/div[2]/div[2]/table/tr/td/div/div[1]/div[4]/div[2]/div[1]/div[2]/span/span/a/text()') == ['N/A']:
            # if html.xpath("/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[2]/div[3]/div[1]/div[2]/span/span/a/text()") == ["N/A"] or \
            #    html.xpath("/html/body/div[2]/div[2]/div/table/tbody/tr/td/div/div[1]/div[2]/div[2]/div[1]/div[2]/span/span/a/text()") == ["N/A"]:
            #     nvd_score = cve_level = cve_desc = repair_time = vector_value = attack_vector = \
            #         access_vector = attack_complexity = access_complexity = \
            #         privilege_required = user_interaction = scope = confidentiality = \
            #         integrity = availability = authentication = None
            #     print("No data on this vulnerability link, ", url)
            #     score_type = ""
            #     cve_desc = str(html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')[0])
            #     if cve_desc:
            #         score_type = "v3.0"
            #     else:
            #         score_type = "v2.0"
            #     element = html.xpath('//*[@id="nistV2MetricHidden"]/@value')
            #     cve_desc = str(html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')[0])
            #     repair_time = str(html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[2]/div/span[1]/text()')[0])
            #     if repair_time is not None:
            #         repair_time = datetime.strptime(repair_time, '%m/%d/%Y')
            #     html1 = etree.HTML(element[0])
            #     cve_level = str(html1.xpath('//*[@data-testid="vuln-cvssv2-base-score-severity"]/text()')[0].strip()).capitalize()
            #     nvd_score = str(html1.xpath('//*[@data-testid="vuln-cvssv2-base-score"]/text()')[0].strip())
            #     vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv2-vector"]/text()')[0]).replace("(", "").replace(")", "").strip()
            #     access_vector = str(html1.xpath('//*[@data-testid="vuln-cvssv2-av"]/text()')[0].strip())
            #     access_complexity = str(html1.xpath('//*[@data-testid="vuln-cvssv2-ac"]/text()')[0].strip())
            #     authentication = str(html1.xpath('//*[@data-testid="vuln-cvssv2-au"]/text()')[0].strip())
            #     confidentiality = str(html1.xpath('//*[@data-testid="vuln-cvssv3-c"]/text()')[0].strip())
            #     integrity = str(html1.xpath('//*[@data-testid="vuln-cvssv2-i"]/text()')[0].strip())
            #     availability = str(html1.xpath('//*[@data-testid="vuln-cvssv2-a"]/text()')[0].strip())
            #     attack_vector = attack_complexity = privilege_required = user_interaction = scope = None
            # elif html.xpath('/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[3]/div[2]/div[1]/div[2]/span/span/a/text()') == [] and \
            #      html.xpath('/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[2]/div[2]/div[1]/div[2]/span/span/a/text()') == []:
            #     nvd_score = cve_level = cve_desc = repair_time = vector_value = attack_vector = \
            #         access_vector = attack_complexity = access_complexity = \
            #         privilege_required = user_interaction = scope = confidentiality = integrity = \
            #         availability = authentication = None
            #     score_type = "v3.0"
            #     print("This vulnerability link not found, ", url)
            # else:
            #     score_type = "v3.0"
            #     cve_desc = str(html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')[0])
            #     repair_time = html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[2]/div/span[1]/text()')[0]
            #     if repair_time is not None:
            #         repair_time = datetime.strptime(repair_time, '%m/%d/%Y')
            #     if html.xpath('//*[@id="nistV3MetricHidden"]/@value'):
            #         element = html.xpath('//*[@id="nistV3MetricHidden"]/@value')
            #     else:
            #         element = html.xpath('//*[@id="cnaV3MetricHidden"]/@value')
            #     html1 = etree.HTML(element[0])
            #     cve_level = str(html1.xpath('//*[@data-testid="vuln-cvssv3-base-score-severity"]/text()')[0].strip()).capitalize()
            #     nvd_score = str(html1.xpath('//*[@data-testid="vuln-cvssv3-base-score"]/text()')[0].strip())
            #     vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv3-vector"]/text()')[0]).replace("(", '').replace(')', '').strip()
            #     attack_vector = str(html1.xpath('//*[@data-testid="vuln-cvssv3-av"]/text()')[0].strip())
            #     attack_complexity = str(html1.xpath('//*[@data-testid="vuln-cvssv3-ac"]/text()')[0].strip())
            #     privilege_required = str(html1.xpath('//*[@data-testid="vuln-cvssv3-pr"]/text()')[0].strip())
            #     user_interaction = str(html1.xpath('//*[@data-testid="vuln-cvssv3-ui"]/text()')[0].strip())
            #     scope = str(html1.xpath('//*[@data-testid="vuln-cvssv3-s"]/text()')[0].strip())
            #     confidentiality = str(html1.xpath('//*[@data-testid="vuln-cvssv3-c"]/text()')[0].strip())
            #     integrity = str(html1.xpath('//*[@data-testid="vuln-cvssv3-i"]/text()')[0].strip())
            #     availability = str(html1.xpath('//*[@data-testid="vuln-cvssv3-a"]/text()')[0].strip())
            #     access_vector = access_complexity = authentication = None
            nvd_score = cve_level = cve_desc = repair_time = vector_value = attack_vector = \
                access_vector = attack_complexity = access_complexity = \
                privilege_required = user_interaction = scope = confidentiality = integrity = \
                availability = authentication = None
            cve_descx = html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')
            if cve_descx is not None and len(cve_descx) > 0:
                cve_desc = str(cve_descx[0])
            # repair_timex = html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[2]/div/span[1]/text()')
            repair_timex = html.xpath('//*[@data-testid="vuln-published-on"]/text()')
            if repair_timex is not None and len(repair_timex) > 0:
                repair_time = str(repair_timex[0])
                if repair_time is not None and repair_time != "":
                    repair_time = str(datetime.strptime(repair_time, '%m/%d/%Y'))
            score_type = "v3.0"
            if html.xpath('//*[@id="nistV3MetricHidden"]/@value'):
                element = html.xpath('//*[@id="nistV3MetricHidden"]/@value')
            else:
                element = html.xpath('//*[@id="cnaV3MetricHidden"]/@value')
            if element and len(element) > 0:
                html1 = etree.HTML(element[0])
                if html1 is not None:
                    cve_level = str(html1.xpath('//*[@data-testid="vuln-cvssv3-base-score-severity"]/text()')[0].strip()).capitalize()
                    nvd_score = str(html1.xpath('//*[@data-testid="vuln-cvssv3-base-score"]/text()')[0].strip())
                    vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv3-vector"]/text()')[0]).replace("(", '').replace(')', '').strip()
                    attack_vector = str(html1.xpath('//*[@data-testid="vuln-cvssv3-av"]/text()')[0].strip())
                    attack_complexity = str(html1.xpath('//*[@data-testid="vuln-cvssv3-ac"]/text()')[0].strip())
                    privilege_required = str(html1.xpath('//*[@data-testid="vuln-cvssv3-pr"]/text()')[0].strip())
                    user_interaction = str(html1.xpath('//*[@data-testid="vuln-cvssv3-ui"]/text()')[0].strip())
                    scope = str(html1.xpath('//*[@data-testid="vuln-cvssv3-s"]/text()')[0].strip())
                    confidentiality = str(html1.xpath('//*[@data-testid="vuln-cvssv3-c"]/text()')[0].strip())
                    integrity = str(html1.xpath('//*[@data-testid="vuln-cvssv3-i"]/text()')[0].strip())
                    availability = str(html1.xpath('//*[@data-testid="vuln-cvssv3-a"]/text()')[0].strip())
                    access_vector = access_complexity = authentication = None
            else:
                element = html.xpath('//*[@id="nistV2MetricHidden"]/@value')
                if element and len(element) > 0:
                    html1 = etree.HTML(element[0])
                    if html1 is not None:
                        score_type = "v2.0"
                        cve_level = str(html1.xpath('//*[@data-testid="vuln-cvssv2-base-score-severity"]/text()')[0].strip()).capitalize()
                        nvd_score = str(html1.xpath('//*[@data-testid="vuln-cvssv2-base-score"]/text()')[0].strip())
                        vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv2-vector"]/text()')[0]).replace("(", "").replace(")", "").strip()
                        access_vector = str(html1.xpath('//*[@data-testid="vuln-cvssv2-av"]/text()')[0].strip())
                        access_complexity = str(html1.xpath('//*[@data-testid="vuln-cvssv2-ac"]/text()')[0].strip())
                        authentication = str(html1.xpath('//*[@data-testid="vuln-cvssv2-au"]/text()')[0].strip())
                        confidentiality = str(html1.xpath('//*[@data-testid="vuln-cvssv3-c"]/text()')[0].strip())
                        integrity = str(html1.xpath('//*[@data-testid="vuln-cvssv2-i"]/text()')[0].strip())
                        availability = str(html1.xpath('//*[@data-testid="vuln-cvssv2-a"]/text()')[0].strip())
                        attack_vector = attack_complexity = privilege_required = user_interaction = scope = None
            if cve_desc == 'N/A':
                cve_desc = None
            if repair_time == 'N/A':
                repair_time = None
            if nvd_score is None or nvd_score == "" or nvd_score == 'N/A':
                nvd_score = None
            print("nvd_score:", nvd_score, "\n",
                  "cve_level:", cve_level, "\n",
                  "repair_time:", repair_time, "\n",
                  "score_type:", score_type, "\n",
                  "vector_value, attack_vector, access_vector, attack_complexity, \n"
                  "access_complexity, privilege_required, user_interaction, scope,\n"
                  "confidentiality, integrity, availability, authentication:\n",
                  vector_value, attack_vector, access_vector, attack_complexity,
                  access_complexity, privilege_required, user_interaction, scope,
                  confidentiality, integrity, availability, authentication, "\n",
                  "cve_desc:", cve_desc)
            xpth_list = [
                nvd_score, cve_level, cve_desc, repair_time, vector_value,
                attack_vector, access_vector, attack_complexity, access_complexity,
                privilege_required, user_interaction, scope, confidentiality,
                integrity, availability, authentication, score_type
            ]
        except IndexError as e:
            print("Subscript out of bounds", e)
        except UnboundLocalError as e:
            print("Tag not found", e)
    return xpth_list
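# The per-metric extractions in crawling() repeat the same take-first-and-strip pattern and
# raise IndexError whenever a field is missing, which the outer except then swallows. A minimal
# sketch of a defensive helper, assuming the same data-testid markup; the helper name is
# illustrative, not part of the original code.
def xpath_first_text(node, testid, default=None):
    """Return the stripped text of the first node with the given data-testid, or default."""
    hits = node.xpath('//*[@data-testid="{}"]/text()'.format(testid))
    return str(hits[0]).strip() if hits else default

# e.g. attack_vector = xpath_first_text(html1, "vuln-cvssv3-av")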
def doctor_line(url):  # doctor detail page
    item4 = []
    res = download(url)
    if res == "下载失败":
        print(res)
        return res
    selector = etree.HTML(res.text)
    # name
    name = selector.xpath('//div[@class="detail word-break"]/h1/strong[@class="J_ExpertName"]')[0].text
    item4.append(name)
    if selector.xpath('//div[@class="detail word-break"]/h1/span'):
        position = ""
        for info in selector.xpath('//div[@class="detail word-break"]/h1/span'):
            position += info.text.lstrip().rstrip()  # position / title
    else:
        position = None
    item4.append(position)
    if selector.xpath('//div[@class="detail word-break"]/div[@id="card-hospital"]/p'):
        hospitals = ""
        for info in selector.xpath('//div[@class="detail word-break"]/div[@id="card-hospital"]/p'):
            hospital = ""
            for part in info.xpath('a|span'):
                hospital += part.text.lstrip().rstrip()
            hospitals += hospital + "/ "
    else:
        hospitals = None
    item4.append(hospitals.rstrip(" / ") if hospitals else hospitals)
    if selector.xpath('//div[@class="detail word-break"]/div[@class="keys"]/a'):
        keys = ""
        for key in selector.xpath('//div[@class="keys"]/a'):
            keys += key.text.lstrip().rstrip() + " / "  # keywords
    else:
        keys = None
    item4.append(keys)
    if selector.xpath('//div[@class="detail word-break"]/div[@class="goodat"]/a'):
        # specialities
        goodat = selector.xpath('//div[@class="detail word-break"]/div[@class="goodat"]/a')[0].attrib["data-description"]
    else:
        goodat = None
    item4.append(goodat)
    if selector.xpath('//div[@class="detail word-break"]/div[@class="about"]/a'):
        # doctor introduction
        about = selector.xpath('//div[@class="detail word-break"]/div[@class="about"]/a')[0].attrib["data-description"]
    else:
        about = None
    item4.append(about)
    if selector.xpath('//div[@class="status"]/div[@class="data"]//strong'):
        evaluate = selector.xpath('//div[@class="status"]/div[@class="data"]//strong')[0].text  # patient ratings
    else:
        evaluate = None
    item4.append(evaluate)
    if selector.xpath('//div[@class="status"]/div[@class="data"]//strong'):
        nr = selector.xpath('//div[@class="status"]/div[@class="data"]//strong')[1].text  # appointment count
    else:
        nr = None
    item4.append(nr)
    if selector.xpath('//div[@class="status"]/div[@class="data"]//strong'):
        ni = selector.xpath('//div[@class="status"]/div[@class="data"]//strong')[2].text  # consultation count
    else:
        ni = None
    item4.append(ni)
    if selector.xpath('//div[@class="consult-type"]/ul/li[1]//p[@class="current-price"]'):
        tit = selector.xpath('//div[@class="consult-type"]/ul/li[1]//p[@class="current-price"]')[0].text  # text-consultation price
    else:
        tit = None
    item4.append(tit)
    if selector.xpath('//div[@class="consult-type"]/ul/li[2]//p[@class="current-price"]'):
        shihua = selector.xpath('//div[@class="consult-type"]/ul/li[2]//p[@class="current-price"]')[0].text  # video-consultation price
    else:
        shihua = None
    item4.append(shihua)
    item4.append(url)
    print("++++++++++++++++++完成一项")
    return item4
def parse_html(html):
    s = etree.HTML(html)
    addr_info = s.xpath('//p[@class="result"]/text()')
    for n in addr_info:
        print(n)
def Film():
    for a in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(a)
        ua = UserAgent(use_cache_server=False)
        ip = ['175.148.79.101', '114.220.29.95', '222.190.217.156']
        print("开始爬取数据")
        html = requests.get(url, proxies={'http': random.choice(ip)},
                            headers={'User-Agent': ua.random})
        selector = etree.HTML(html.text)
        infos = selector.xpath('//ol[1][@class="grid_view"]')
        for info in infos:
            number = info.xpath('//div[@class="pic"]/em/text()')
            name = info.xpath('//span[1][@class="title"]/text()')
            messages = info.xpath('//div[@class="bd"]/p/text()[2]')
            # print(messages)
            a = 0
            typelist = []
            yearlist = []
            peoplelist = []
            while a < len(messages):
                m = messages[a]
                year = m.split('/')[0]
                year = re.findall(r"\d+", year)
                years = ''.join(year)
                type = m.split('/')[-1]
                type = type.split(' ')[1]
                type = type.split('\n')[0]
                yearlist.append(years)
                typelist.append(type)
                a += 2
            # print(yearlist)
            # print(typelist)
            source = info.xpath('//span[@class="rating_num"]/text()')
            peoples = info.xpath('//div[@class="star"]/span[4]/text()')
            for i in peoples:
                people = re.findall(r"\d+", i)
                people = ''.join(people)
                peoplelist.append(people)
            time.sleep(random.randint(1, 3))
            list = []
            for (i1, i2, i3, i4, i5, i6) in zip(number, name, yearlist, typelist, source, peoplelist):
                list.append((i1, i2, i3, i4, i5, i6))
            for i in list:
                print(i)
            SQ = pymysql.connect(host='localhost', port=3306, user='******',
                                 passwd='123456', db='doubanfilm', charset='utf8')
            L = SQ.cursor()
            L.executemany(
                "INSERT INTO doubanfilm(number, name, year, type, source, people) VALUES "
                "(%s,%s,%s,%s,%s,%s)", list)
            SQ.commit()
            print('传入数据库完成')
def second_parse(title):
    '''
    Fetch the details of each trending topic.
    :param title:
    :return:
    '''
    for it in title:
        time.sleep(5)
        redian = {}
        redian["热点"] = it
        url = 'https://s.weibo.com/weibo?q=%23{}%23'.format(urllib.parse.quote(it))
        pattern = re.compile('[\u4e00-\u9fa5]+(.*?[\u4e00-\u9fa5])')
        print(url)
        try:
            source_code = requests.get(url, headers=headers)
            if source_code.status_code == 200:
                # with open('redian.html', 'wb') as fp:
                #     fp.write(source_code.content)
                tree = etree.HTML(source_code.text)
                # read count of the topic
                Reading_volume = tree.xpath('//span/text()')
                if len(Reading_volume) != 0:
                    Reading_volume = Reading_volume[0]
                if len(Reading_volume) != 0:
                    Reading_volume = pattern.findall(Reading_volume)
                    if len(Reading_volume) > 0:
                        Reading_volume = Reading_volume[0]
                # if len(Reading_volume) > 0:
                #     Reading_volume = Reading_volume[0]
                redian["阅读量"] = Reading_volume
                # discussion count (currently disabled)
                # discuss_amount = tree.xpath('//span/text()')[1]
                # discuss_amount = pattern.findall(discuss_amount)[0]
                # # if len(discuss_amount) > 0:
                # #     discuss_amount = discuss_amount[0]
                # redian["讨论数"] = discuss_amount
                # record the crawl time
                redian["时间"] = time.asctime(time.localtime(time.time()))
                # lead paragraph of the topic
                content = tree.xpath('//div[@class="card-wrap"]/div/p/text()')
                if len(content) > 0:
                    content = content[0] + url
                else:
                    content.append(url)
                if isinstance(content, list):
                    redian["热点导语"] = content[0]
                else:
                    redian["热点导语"] = content
                print(redian)
                try:
                    with open(os_path, 'a+', encoding="utf-8") as fp:
                        fp.write(str(redian) + '\n')
                except IOError as err:
                    print("文件写入失败")
                amount, time_now = parse_readAmount_time(redian['时间'] + ' ' + redian['阅读量'])
                content, url_c = parse_url_content(redian["热点导语"])
                title = it
                # run the SQL statements
                try:
                    with connection.cursor() as cursor:
                        delSql = "DELETE FROM Hotspot WHERE title = %s"
                        sql = ("INSERT INTO Hotspot (title, amount, daytime, content, url, keywords) "
                               "VALUES (%s, %s, %s, %s, %s, %s)")
                        cursor.execute(delSql, (title))
                        cursor.execute(sql, (title, amount, time_now, content, url_c, ''))
                        # autocommit is off, so commit explicitly to persist the statements
                        connection.commit()
                finally:
                    pass
        except RequestException:
            print("响应请求失败")
from HtmlRetrival import HtmlRetrival
from lxml import etree

html_re = HtmlRetrival('http://bbs.qyer.com/thread-2631045-1.html')
content = html_re.get_content()
tags = {
    'title': '//h3[@class="b_tle"]',
    'content': '//td[@class="editor bbsDetailContainer"]//*[self::p or self::span or self::h1]'
}
tr = etree.HTML(content)
info = {}
f = open('template.txt', 'wb')
for tag in tags:
    info[tag] = []
    f.write('\r\n\r\n' + tag + '\r\n\r\n')
    eles = tr.xpath(tags[tag])
    for ele in eles:
        if ele is None or ele.text is None:
            continue
        info[tag].append(ele.text)
        f.write(ele.text.encode('utf-8') + '\r\n')
f.close()
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36" } proxies = { "http": "socks5://127.0.0.1:1080", 'https': 'socks5://127.0.0.1:1080' } base_url = 'https://gelbooru.com/index.php?page=post&s=list&tags=animated&pid={}' for page_num in range(10): try: r_page = requests.get(base_url.format(page_num * 42)) except: pass html = etree.HTML(r_page.content) item_urls = html.xpath( "//div[@class='contain-push']/div[@class='thumbnail-preview']//a/@href" ) for item_url in item_urls: r_item = requests.get("https:" + item_url) item_html = etree.HTML(r_item.content) video_url = item_html.xpath("//video/source/@src") if len(video_url) > 0: video_url = video_url[0] print('downloading: ', page_num, video_url) with open('./public/gelbooru/' + video_url.split('/')[-1], 'wb') as f: try: f.write(requests.get(video_url).content) except Exception as e:
import requests
import csv
from lxml import etree
import json
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
r = requests.get('http://www.seputu.com/', headers=headers)
html = etree.HTML(r.text)
div_mulus = html.xpath('//*[@class="mulu"]')
rows = []
for div_mulu in div_mulus:
    # chapter-group title
    div_h2 = div_mulu.xpath('.//div[@class="mulu-title"]/center/h2/text()')
    if len(div_h2) > 0:
        # print(h2.string)
        h2_title = div_h2[0]
        a_s = div_mulu.xpath('./div[@class="box"]/ul/li/a')
        # list = []
        # get each chapter's title and URL
        for a in a_s:
            # print(a)
            href = a.xpath('./@href')[0]
            box_title = a.xpath('./@title')[0]
            pattern = re.compile(r'\s*\[(.*)\]\s+(.*)')
import requests
from lxml import etree
import re
import os


def getpic(url):
    name = url.split('/')[-1]
    print(name)
    picdata = requests.get(url)
    with open('D:\\mzsock\\' + name, 'wb') as f:
        f.write(picdata.content)


r = requests.get("http://mzsock.com/mv/")
r.encoding = 'utf-8'
r = r.text
ehtml = etree.HTML(r)
nurl = ehtml.xpath('//*/li/div/a[@class="img"]/@href')
for mz in nurl:
    mm = requests.get(mz)
    mm.encoding = 'utf-8'
    mm = mm.text
    mhtml = etree.HTML(mm)
    murl = mhtml.xpath('//*/a[@class="image_cx_cont"]/img/@src')
    for mmurl in murl:
        getpic(mmurl)
from lxml import html
from lxml import etree
from collections import OrderedDict
import requests
from ezraLibrary import textOfDiv

link = "https://www.facebook.com/careers/jobs/a0I1200000JY01QEAT/"
response = requests.get(link)  # get page data from server, block redirects
sourceCode = response.content  # get string of source code from response
htmlElem = html.document_fromstring(sourceCode)  # make HTML element object
aDict = {}
root = etree.HTML(sourceCode)
# root.findall(".//table")
aDict = {}
for e1 in root.iterfind(".//div"):
    texto = textOfDiv(e1)
    palabras = texto.split()
    for palabra in palabras:
        aDict[palabra] = palabra
for item in aDict:
    print(item.encode('utf-8'))
import requests
from lxml import etree

html_str = open("index.html", 'r', encoding='utf-8').read()
# print(html_str)

# Parse the HTML source with lxml
html = etree.HTML(html_str)
# soup = BeautifulSoup(r.text, "html.parser")
# print(html)  # <Element html at 0x1db33b459c8>

# To inspect a node's source, use etree.tostring(node)
# print(etree.tostring(html, encoding='utf-8').decode("utf-8"))

# xpath('rule') returns a list; an empty list means nothing matched
# nodename selects a node by name
# print(html.xpath("head"))  # the head node under html
# print(html.xpath("body"))  # the body node under html

# A leading / selects from the root; element/element selects the next level down
# print(html.xpath("/html/head"))   # head under html under the root
# print(html.xpath("head/title"))   # title under head
# print(html.xpath("body/div"))     # every div under body (bs4 would only give the first)
# print(html.xpath("body/div")[1].xpath('ul/li'))  # take the div at index 1, then its ul/li

# // searches at any depth, ignoring the level
# print(html.xpath("//li"))          # all li tags
# print(html.xpath("//li/text()"))   # the text of all li tags
# print(html.xpath("body/div/ul/li"))  # div/ul/li must match level by level
# print(html.xpath("body/div//li"))    # search for li anywhere inside matching divs

# . starts the query from the current node

# Find the name and floor area of every listing
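# The last comment above asks for each listing's name and floor area, but the snippet stops
# there. A minimal sketch of one way to finish it, assuming hypothetical class names
# ("house", "house-name", "house-area") since the real markup of index.html is not shown.
for house in html.xpath('//div[@class="house"]'):
    name = house.xpath('.//span[@class="house-name"]/text()')
    area = house.xpath('.//span[@class="house-area"]/text()')
    print(name[0] if name else None, area[0] if area else None)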
def page_info(self, jq_url):
    self.driver.get(jq_url)
    time.sleep(random.randint(5, 10))
    html = etree.HTML(self.driver.page_source)
    # print(self.sheng_name, self.shi_name, self.jq_name, self.level)
    address = self.list_to_str(html.xpath('//p[@class="linetext"]/@title'))
    opentime = ''
    if html.xpath('//div[@class="hasdown-pre"]/p/text()'):
        opentime = self.list_to_str(html.xpath('//div[@class="hasdown-pre"]/p/text()'))
    else:
        opentime = self.list_to_str(html.xpath('//div[@class="sec-inner"]/dl[2]/dd/p/text()'))
    yang_dict = self.yangtu(html.xpath('//dl[@class="pic_tab_dl"]/dd/img/@src'))
    xiang_dict = self.xiangtu(html.xpath('//div[@id="introduction"]/div[2]//img/@to'))
    div_code = self.div_code(url=jq_url).replace('to=', 'src=')
    div_code += '景区: {}</br>'.format(self.jq_name)
    div_code += '地址: {}</br>'.format(address)
    div_code += '开放时间: {}</br></br>'.format(opentime)
    # create the target directory
    self.addr_path = r'//192.168.100.173/移动库/旅游景区/驴妈妈/{}/{}/{}/'.format(
        self.sheng_name, self.shi_name, self.jq_name)
    self.make_dir(addr_path=self.addr_path)
    print('创建目录成功啦!')
    car_title = html.xpath('//div[@class="nchTrafficDerc clearfix"]/div[1]/ul/li/b/text()')
    car_content = html.xpath('//div[@class="nchTrafficDerc clearfix"]/div[@class="nchTrafficTab"]')
    traffic_info = ''
    # print('交通指南:\n')
    for k, v in zip(car_title, car_content):
        content = ''
        for i in v.xpath('./div/p//text()'):
            content += '{} </br>'.format(i)
        traffic_info += '{}</br>\n{}</br>\n'.format(k, content)
    # print(traffic_info)
    div_code += traffic_info
    if self.insert_MongoDB(jq_name=self.jq_name, yang_dict=yang_dict,
                           xiang_dict=xiang_dict, div_code=div_code):
        print('插入Mongo成功\n')
    else:
        if not os.path.isdir(self.addr_path):
            shutil.rmtree(self.addr_path)
        print('目录以移除!')
print "正在获取详情页面,url为" #url ="https://item.taobao.com/item.htm?id=538287375253&abtest=10&rn=07abc745561bdfad6f726eb186dd990e&sid=46f938ba6d759f6e420440bf98b6caea" #num_id = re.findall('id=[0-9]+&',url)[0].replace('id=','').replace('&','') #url = "https://detail.tmall.com/item.htm?id="+str(num_id) print url driver.get(url) driver.implicitly_wait(40) # 设置智能超时时间 html = driver.page_source.encode('utf-8') driver.quit() except Exception, e: print "页面加载失败", e return 0 try: print '正在解析页面' try: selector = etree.HTML( html, parser=etree.HTMLParser(encoding='utf-8')) except Exception, e: print "页面加载失败", e return 0 try: # 此部分用于采集每月销量的数据 context = selector.xpath('//div[@class="tm-indcon"]') xiaoliang_date = u'' for i in range(len(context)): temp_date = etree.tostring( context[i], encoding="utf-8") # .encode('utf-8') re_h = re.compile('</?\w+[^>]*>') # 去除一切html标签 s = re_h.sub('', temp_date) + ',' xiaoliang_date += s list_date += xiaoliang_date + ';' except Exception, e:
def info(url, animes, episodes, headers):
    r = requests.get(url, headers=headers)
    content1 = decode(r)
    tree = etree.HTML(content1)
    names = tree.xpath('//div[@class="detail con24 clear"]/dl/dd/h1/text()')
    if names == []:
        return
    name = names[0]
    quarters = tree.xpath('//div[@class="detail con24 clear"]/dl/dd/div[@class="d_label"][2]/a/text()')
    if quarters == []:
        return
    quarter = quarters[0]
    introductions = tree.xpath('//div[@class="detail con24 clear"]/dl/dd/div[@class="d_label2"][3]/text()')
    if introductions == []:
        return
    introduction = introductions[0]
    times = tree.xpath('//div[@class="detail con24 clear"]/dl/dd/div[@class="d_label2"][last()]/text()')
    # use the third entry of the scraped list
    if times == []:
        return
    time = times[2]
    covers = tree.xpath('//div[@class="detail con24 clear"]/dl/dt/img/@src')  # cover image URL
    if covers == []:
        return
    cover = covers[0]
    a = anime(quarter=quarter, time=time, name=name, cover=cover, introduction=introduction)
    animes.put(a)
    # several similar divs at the second level, so walk down from the first
    e_as = tree.xpath('//div[@class="time_pic list"]/div[1]/div/div/div/ul/li/a')
    for e_a in e_as:
        e_url = e_a.xpath('string(@href)')
        e_r = requests.get(e_url, headers=headers)
        content2 = decode(e_r)
        e_tree = etree.HTML(content2)
        num = e_a.xpath('string(./em/span/text())')
        e_name = e_a.xpath('string(./em/text())')
        e_srcs = e_tree.xpath('//div[@class="container clear"]/div[@class="clear"]/div[@class="player_main"]/iframe/@src')
        e_src = ''
        if e_srcs != []:
            e_src = e_srcs[0]
        else:
            try:
                e_src = dynamic(e_url)
            except:
                print("Something wrong with dynamic!!\n")
                print("{} 第{}集没链接!\r\n".format(a.name, num))
        e = episode(num, e_name, e_src, a)
        episodes.put(e)
    r.close()
        connection.close()
        print('向MySQL中添加数据成功!')
    except TypeError:
        pass


if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options)
    for url_str in Python_sel_Mysql():
        html = call_page(url_str)
        time.sleep(3)
        big_list = []
        selector = etree.HTML(html)
        jobs = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title")
        salary = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()")
        firms = selector.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title')
        big_list
        print(big_list)
        # insertDB(content)
        print(datetime.datetime.now())
def get_order_detail_data(driver, detail_url, address_list, good_order_dict):
    """
    Grab the order-detail page data: order times, receiver name, phone number,
    shipping address, product name, amount and quantity.
    :param driver:
    :param detail_order_list:
    :param status_list:
    :return:
    """
    data_time_list = []
    driver.get("https:" + detail_url)
    refresh_page_xpath(driver, "//span[contains(text(),'您的位置')]")
    html_str = driver.page_source.encode("utf-8").decode()
    html = etree.HTML(html_str)
    baobei_ele = html.xpath("//th[contains(text(),'宝贝')]")
    shangpin_ele = html.xpath("//dd[contains(text(),'商品')]")
    if len(baobei_ele) == 0 and len(shangpin_ele) == 0:
        good_order_dict["transaction_time"] = "0000-00-00 00:00:00"
        good_order_dict["payment_time"] = "0000-00-00 00:00:00"
        good_order_dict["confirmation_time"] = "0000-00-00 00:00:00"
        good_order_dict["receiver_name"] = ""
        good_order_dict["receiver_phone"] = ""
        good_order_dict["receiver_address"] = ""
        good_order_dict["products"] = ""
        return good_order_dict
    times_list = re.findall(r"<span.*?>(\d+-\d+-\d+ \d+:\d+:\d+)</span>?", html_str)
    times_list = set(times_list)
    for i in times_list:
        data_time = datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S")
        data_time_list.append(data_time)
    data_time_list.sort()
    if len(data_time_list) >= 3:
        transaction_time = str(data_time_list[0])
        payment_time = str(data_time_list[1])
        confirmation_time = str(data_time_list[-1])
    else:
        transaction_time = str(data_time_list[-1])
        payment_time = "0000-00-00 00:00:00"
        confirmation_time = "0000-00-00 00:00:00"
    good_order_dict["transaction_time"] = transaction_time
    good_order_dict["payment_time"] = payment_time
    good_order_dict["confirmation_time"] = confirmation_time
    # look up the receiver's address block
    for addr_dict in address_list:
        receiver_name = addr_dict["receiver_name"]
        addr_list = html.xpath("//span[contains(text(),'{}')]/text()".format(receiver_name))
        if len(addr_list) == 0:
            addr_list = html.xpath("//dd[contains(text(),'{}')]/text()".format(receiver_name))
        if len(addr_list) == 0:
            addr_list = html.xpath("//td[contains(text(),'{}')]/text()".format(receiver_name))
        if len(addr_list) > 0:
            break
    try:
        addr_list = addr_list[0]
    except Exception as e:
        addr_list = ""
    receiver_list = split_receiver_data(addr_list)
    try:
        receiver_name = receiver_list[0]
    except Exception as e:
        receiver_name = ""
    try:
        receiver_phone = receiver_list[1]
    except Exception as e:
        receiver_phone = ""
    try:
        receiver_address = receiver_list[2]
    except Exception as e:
        receiver_address = ""
    good_order_dict["receiver_name"] = receiver_name
    good_order_dict["receiver_phone"] = receiver_phone
    good_order_dict["receiver_address"] = receiver_address
    goods_list = get_good_data(driver, html)
    good_order_dict["products"] = goods_list
    return good_order_dict
def we(j):
    filename = "素材.csv"
    t = 0
    for m in range(len(url_l(w())[0])):
        t += 1
        s = "html/" + url_l(w())[1][m] + ".html"
        # print(s)
        # with open(s, "r", encoding="utf-8") as file:
        #     mfile = file.read()
        sj = random.randint(1, 3)
        time.sleep(sj)
        urll = url_l(w())[0][m]
        mresponse = requests.get(urll, headers=sj_User_Agent())
        mresponse.encoding = "utf-8"
        mfile = etree.HTML(mresponse.text)

        def hh(m):
            if m + 1 == 1:
                # 矢量 (vector graphics)
                sl_url = mfile.xpath('//div//div[@class="text_left"]//div[@class="box picblock col3"]/div/a/img/@src2')
                # for m in sl_url:
                #     return m
                return sl_url
            elif m + 1 == 2:
                # 高清图片 (HD pictures)
                tp_url = mfile.xpath('//div[@id="container"]/div[@class="box picblock col3"]/div/a/img/@src2')
                # for m in tp_url:
                #     return m
                return tp_url
            elif m + 1 == 3:
                # 图标 (icons)
                tb_url = mfile.xpath('//ul[@class="pngblock imgload"]/li/p/a/img/@src2')
                # for m in tb_url:
                #     return m
                return tb_url
            elif m + 1 == 4:
                # PSD素材 (PSD materials)
                psd_url = mfile.xpath('//div[@class="box col3 ws_block"]/a/img/@src')
                # for m in psd_url:
                #     return m
                return psd_url
            elif m + 1 == 5:
                # 字体 (fonts)
                zt_url = mfile.xpath('//div//div[@class="index_font_list clearfix"]//li[@class="font"]/div/a/img/@src')
                # for m in zt_url:
                #     return m
                return zt_url
            elif m + 1 == 6:
                # 英文字体 (English fonts)
                ywzt_url = mfile.xpath('//li[@class="font"]/div/a/img/@src')
                # for m in ywzt_url:
                #     return m
                return ywzt_url
            elif m + 1 == 7:
                # 音效 (sound effects)
                yx_url = mfile.xpath('//div[@class="music_block"]/p[@class="n1"]/@thumb')
                # for m in yx_url:
                #     return m
                return yx_url
            elif m + 1 == 8:
                # PPT模板 (PPT templates)
                ppt_url = mfile.xpath('//div[@class="sc_warp mt20"]/div[@id="main"]/div/div/a/img/@src')
                # for m in ppt_url:
                #     return m
                return ppt_url
            elif m + 1 == 9:
                # 简历模板 (resume templates)
                jl_url = mfile.xpath('//div[@class="sc_warp mt20"]/div[@id="main"]/div/div/a/img/@src')
                # for m in jl_url:
                #     return m
                return jl_url
            else:
                pass

        dic = {
            '矢量': hh(m),
            '高清图片': hh(m),
            '图标': hh(m),
            'PSD素材': hh(m),
            '字体': hh(m),
            '英文字体': hh(m),
            '音效': hh(m),
            'PPT模板': hh(m),
            '简历模板': hh(m)
        }
        # print(dic)
        list.append(dic)
        bccsv(filename, dic)
        hh = ",".join(hh(m))
        print(hh, type(hh))
        sql = ("insert into sc (矢量,高清图片,图标,PSD素材,字体,英文字体,音效,PPT模板,简历模板) "
               "values ('%s','%s','%s','%s','%s','%s','%s','%s','%s')") % (
            hh, hh, hh, hh, hh, hh, str(hh), hh, hh)
        cursor.execute(sql)
    with open("素材.json", "w", encoding="utf-8") as json_file:
        json_file = json.dump({
            "total": t * j,
            "data": list
        }, json_file, ensure_ascii=False, indent=4)
    print("爬取%s条" % (t))
import requests
import csv
from lxml import etree

moviedata = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47'
}
for i in range(0, 250, 25):
    url = 'https://movie.douban.com/top250?start=' + str(i)
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    movies = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for movie in movies:
        # rank
        count = movie.xpath('./div/div[1]/em/text()')[0]
        # title
        title = movie.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
        # director and cast
        director_actor = movie.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip('\n').strip('\xa0')
        # director
        director = director_actor[:director_actor.find('主演')].strip()
        # cast
        actor = director_actor[director_actor.find('主演'):].strip()
        # genre / region / year
        type_region_year = movie.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip('\n').strip('\xa0')
        # year
        year = type_region_year.split('/')[0].strip()
        # region
        region = type_region_year.split('/')[1].strip()
        # genre
        type = type_region_year.split('/')[2].strip()
def get_url_info(url_list, f_data):
    global dict_data
    # load previously crawled records from the JSON file
    f_data.seek(0, 0)
    content = f_data.read()
    if content:
        dict_data = json.loads(content)
    # fetch the config-table id and attach it to the result records
    cur.execute("SELECT id FROM t_spider_conf WHERE domain = %s", spider_url)
    conf_id = cur.fetchone()
    conf_id = conf_id[0]
    # running count of news items
    sum_i = 0
    # get the name of the news column
    r = requests.get(url_list[0], headers=headers)
    r.encoding = 'UTF-8'
    html = etree.HTML(r.text)
    news_heading = html.xpath('//*[@id="bok_0"]/div[@class="zzj_3"]/text()')
    news_heading = ''.join(news_heading)
    # create the output directory if it does not exist yet
    now_dir = os.getcwd()
    new_dir = now_dir + '/' + news_heading
    dir_judge = os.path.exists(new_dir)
    if not dir_judge:
        os.mkdir(new_dir)
    # print(new_dir)
    html_filter, news_url, news_title, news_author, news_time = '', '', '', '', ''
    # process every news item on every page
    for i, url in enumerate(url_list):
        for j in range(0, i_news):
            # combine the news title and content and keep them in a dict
            # temp_info = {}
            r = requests.get(url, headers=headers)
            r.encoding = 'UTF-8'
            html = etree.HTML(r.text)
            tips = '正在获取{}栏目下第{}页第{}条新闻,总第{}条新闻......'.format(
                news_heading, i + 1, j + 1, sum_i + 1)
            print(tips)
            try:
                xpath_temp = '//*[@id="bok_0"]/div[@class="zzj_5"]/div[' + str(1 + j * 2) + ']/a/'
                # temp_info['title'] = html.xpath(xpath_temp + 'span/text()')[0]
                news_title = html.xpath(xpath_temp + 'span/text()')[0]
                # URL of the individual news item
                news_url = html.xpath(xpath_temp + '@href')
                news_url = ''.join(news_url)
                # print(news_url)
                # tips helps locate items the crawler failed to fetch
                # temp_info['content'] = get_url_content(news_url, tips)
                # print(temp_info)
                print('新闻标题:{}'.format(news_title))
                # skip items already recorded in the dict/database
                judge = news_url in dict_data.keys()
                if not judge:
                    dict_data[news_url] = news_title
                    res = requests.get(news_url, headers=headers)
                    res.encoding = 'UTF-8'
                    raw_html = res.text
                    # handle pages that immediately redirect
                    search_refresh = re.search(r'http-equiv="refresh".*\'', raw_html)
                    if search_refresh:
                        # print(search_refresh.group())
                        refresh_url = re.search(r'[a-zA-z]+://[^\s]*\w', search_refresh.group())
                        refresh_url = refresh_url.group()
                        # make requests ignore SSL verification and its warnings
                        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
                        refresh_res = requests.get(refresh_url, headers=headers, verify=False)
                        refresh_res.encoding = 'UTF-8'
                        # print(refresh_res)
                        raw_html = refresh_res.text
                        judge_identifier = not_found_judge(raw_html)
                        # continue only if the page is not a 404
                        if judge_identifier:
                            # print(raw_html)
                            html_filter = sensitive_word_filter(raw_html)
                            # upgrade images to high resolution for the picture-news column
                            if news_heading == '郑州大学网上新闻(图片新闻)':
                                html_filter = img_update(html_filter)
                            timestamp = round(time.time())
                            html_file = new_dir + '/' + str(timestamp) + '.html'
                            pdf_file = new_dir + '/' + str(timestamp) + '.pdf'
                            try:
                                pass
                            except pymysql.err.DataError:
                                print("html编码错误或值错误!")
                                html_filter = html_filter.encode(encoding='UTF-8', errors='ignore')
                            finally:
                                time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                cur.execute(insert_result,
                                            (conf_id, 'detail', news_url, html_filter, html_file,
                                             pdf_file, time_now, news_heading, news_title,
                                             news_author, news_time))
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                print('该新闻《{}》已爬取。'.format(news_title))
                            with open(html_file, 'w+', encoding='UTF-8') as f1:
                                f1.write(html_filter)
                            # convert the HTML to PDF; redirect targets have different markup,
                            # so news_author and news_time stay empty here
                            pdfkit.from_url(refresh_url, pdf_file, configuration=confg)
                            print('该新闻《{}》pdf格式已转换成功。'.format(news_title))
                        else:
                            # record the 404 in the database
                            html_filter = '404 not found'
                            time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            cur.execute(insert_result,
                                        (conf_id, 'detail', news_url, html_filter, '', '',
                                         time_now, news_heading, news_title, news_author, news_time))
                            conn.commit()
                            json_data = json.dumps(dict_data)
                            f_data.seek(0, 0)
                            f_data.write(json_data)
                            print('该新闻《{}》网页不存在, 以‘404 not found’为网页内容存入数据库。'.format(news_title))
                    # handle pages that do not redirect
                    else:
                        judge_identifier = not_found_judge(raw_html)
                        # continue only if the page is not a 404
                        if judge_identifier:
                            html = etree.HTML(raw_html)
                            news_author = html.xpath('//*[@id="bok_0"]/div[@class="zzj_4"]/span[1]/text()')
                            news_time = html.xpath('//*[@id="bok_0"]/div[@class="zzj_4"]/span[3]/text()')
                            html_filter = sensitive_word_filter(raw_html)
                            # upgrade images to high resolution for the picture-news column
                            if news_heading == '郑州大学网上新闻(图片新闻)':
                                html_filter = img_update(html_filter)
                            # print(html_filter)
                            timestamp = round(time.time())
                            html_file = new_dir + '/' + str(timestamp) + '.html'
                            pdf_file = new_dir + '/' + str(timestamp) + '.pdf'
                            try:
                                pass
                            except pymysql.err.DataError:
                                print("html编码错误或值错误!")
                                html_filter = html_filter.encode(encoding='UTF-8', errors='ignore')
                            finally:
                                time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                cur.execute(insert_result,
                                            (conf_id, 'detail', news_url, html_filter, html_file,
                                             pdf_file, time_now, news_heading, news_title,
                                             news_author, news_time))
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                print('该新闻《{}》已爬取。'.format(news_title))
                            # save the raw HTML that was crawled
                            with open(html_file, 'w+', encoding='UTF-8') as f1:
                                f1.write(html_filter)
                            # swap 黑体 for 宋体 where the font cannot be rendered
                            err_index = html_filter.find('黑体')
                            if err_index != -1:
                                html_filter = html_filter[:err_index] + '宋体' + html_filter[err_index + len('黑体'):]
                            # convert the HTML to PDF
                            pdfkit.from_string(html_filter, pdf_file, configuration=confg)
                            print('该新闻《{}》pdf格式已转换成功。'.format(news_title))
                        else:
                            # record the 404 in the database
                            html_filter = '404 not found'
                            time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            cur.execute(insert_result,
                                        (conf_id, 'detail', news_url, html_filter, '', '',
                                         time_now, news_heading, news_title, news_author, news_time))
                            conn.commit()
                            json_data = json.dumps(dict_data)
                            f_data.seek(0, 0)
                            f_data.write(json_data)
                            print('该新闻《{}》网页不存在, 以‘404 not found’为网页内容存入数据库。'.format(news_title))
                else:
                    print('该新闻《{}》已爬取过且保存在数据库中!'.format(news_title))
            except IOError:
                print("Warning: wkhtmltopdf读取文件失败, 可能是网页无法打开或者图片/css样式丢失。")
            except IndexError:
                print("该栏目《{}》下的新闻已全部爬取完!".format(news_heading))
                break
            sum_i += 1
            time.sleep(sleep_time)
            # clear the previous values
            html_filter, news_url, news_title, news_author, news_time = '', '', '', '', ''
        time.sleep(sleep_time)
def classify(self, url):
    data = urllib2.urlopen(url)
    code = data.read()
    selector = etree.HTML(code)
    return selector.xpath('//h1[@class="title"]/a/text()')
def get_comments(self, comments_url):
    resp = urllib.request.urlopen(comments_url)
    html_data = resp.read().decode('utf-8')
    # build the xpath query
    html = etree.HTML(html_data)
    return html.xpath('//*[@id="comments"]/div[1]/div[2]/p/span/text()')[0]
def get_info_number(start_url):
    res = get_response(start_url)
    selector = etree.HTML(res.text)
    numbers = selector.xpath('//h2[@class="total fl"]/span/text()')[0]
    return numbers