def getClassNames(self, dic, html_content, t=''):
    """Resolve SVG-font obfuscated placeholder tags back to real characters.

    For each tag type in *dic*, download its SVG glyph sheet, work out which
    character each placeholder class maps to (via ``self.getvalue``) and
    substitute it into *html_content*. The SVG and the patched HTML are also
    dumped under ``./html/`` for debugging.

    :param dic: mapping of tag name -> {'url': svg_url, 'className': key,
                key: {class_name: (x, y)}} — presumably built from the page's
                CSS; TODO confirm against the caller.
    :param html_content: raw HTML containing empty placeholders such as
                         ``<bb class="rt03h"></bb>``
    :param t: optional timestamp suffix used in dump file names
    :return: the HTML with every placeholder replaced by its character
    """
    for itemskey, itemsvalue in dic.items():
        # All placeholder class names used for this tag type.
        tagList = re.findall('<' + itemskey + ' class="(.*?)"></' + itemskey + '>', html_content)
        svg_url = itemsvalue.get('url', None)
        svg_classValue = itemsvalue.get('className', None)
        svgcont = requests.get(svg_url).text
        # BUG FIX: the original wrote `if os.makedirs('./html', exist_ok=True):
        # os.makedirs('./html')` — os.makedirs always returns None, so the
        # inner call was dead code; exist_ok=True alone is correct and
        # idempotent.
        os.makedirs('./html', exist_ok=True)
        svg_name = './html/' + itemskey + str(t) + '.svg'
        with open(svg_name, 'w') as f:
            f.write(svgcont)
        # Two SVG layouts exist in the wild: one places characters along
        # <defs>/<textPath> rows, the other uses plain <text x= y=> rows.
        defs = re.findall('<defs>', svgcont)
        if len(defs) > 0:
            # textPath variant: each <path id d="M0 <y> H..."> carries the
            # row's y coordinate; the row's characters sit in the matching
            # <textPath xlink:href="#id"> element.
            index = re.findall('<path id="(\d+)" d="M0 (\d+) H\d+"/>', svgcont)
            textPath = re.findall('">(.*?)</textPath>', svgcont)
            values = [list(textPath[v]) for v in range(len(textPath))]
        else:
            # plain <text> variant: x is either a single origin per row or a
            # whitespace-separated list of per-character offsets.
            posxydata = re.findall('<text x="(.*?)" y="(.*?)">(.*?)</text>', svgcont)
            if len(posxydata) > 1:
                if posxydata[0][0] == posxydata[1][0]:
                    # Every row starts at the same x: rows are indexed by y
                    # only. NOTE(review): the original left `index` from a
                    # previous loop iteration here (its re.findall was
                    # commented out) — behavior preserved; confirm intended.
                    textPath = re.findall('">(.*?)</text>', svgcont)
                    values = [list(textPath[v]) for v in range(len(textPath))]
            else:
                # Single row with explicit per-character x offsets.
                xs = posxydata[0][0].strip().split(' ')
                index = [(int(i), float(posxydata[0][1])) for i in xs]
                values = [list(posxydata[0][2]) for i in xs]
        for tag in tagList:
            # (x, y) background-position registered for this class name.
            pos = itemsvalue.get(svg_classValue, {}).get(tag)
            posxy = (float(pos[0]), float(pos[1]))
            fontvalue = self.getvalue(posxy, index, values)
            html_content = html_content.replace(
                '<' + itemskey + ' class="' + tag + '"></' + itemskey + '>',
                fontvalue)
    # Dump the patched page, named after the shop, for inspection.
    selector = parsel.Selector(html_content)
    shopname = selector.css('.shop-name::text').get()
    name = './html/替换之后的-' + shopname + str(t) + '.html'
    with open(name, 'w', encoding='utf-8') as f:
        f.write(html_content)
    return html_content
def login(username, password, domain):
    """Open a session against *domain* and return the landing page as a Selector.

    NOTE(review): *username* and *password* are currently unused — no
    credentials are ever submitted; the function only fetches the page.
    """
    http = requests.Session()
    landing = http.get(domain)
    return parsel.Selector(text=landing.text)
# Scrape the poll values from Caixin's US-election interactive page.
import parsel
from selenium import webdriver

browser = webdriver.Chrome('chromedriver.exe')  # launch a Chrome instance
browser.get('http://datanews.caixin.com/interactive/2020/us-president-election/')
rendered = browser.page_source  # HTML after JavaScript rendering
tree = parsel.Selector(rendered)
# The last <g> element inside the chart SVG holds the data points.
target = tree.xpath('//div[@class="us-detail homepage"]/div[@class="chart"]/svg/g')[-1]
poll_values = target.xpath('./rect/@value')
for value in poll_values:
    # Append each value on its own line (file reopened per value, as before).
    with open('民调.csv', 'a', encoding='utf8') as f:
        f.write(value.get())
        f.write('\n')
def menu_search_download(url, num, text, window, flag):
    """Download sticker albums from fabiaoqing.com, page by page.

    :param url: listing URL of a category/search result (ends in ``.html``)
    :param num: number of listing pages to walk
    :param text: Tk text widget used as a progress log
    :param window: Tk window, refreshed after each write
    :param flag: 1 = "menu" listing layout, 0 = "search" listing layout
    """
    global photos_url
    for page in range(1, num + 1):
        # e.g. https://www.fabiaoqing.com/bqb/lists/type/doutu/page/2.html
        URL = str(url).split('.html')[0] + '/page/{}.html'.format(
            page)  # paginated listing URL for this category
        proxy = {'HTTP': random.choice(proxies)}  # rotate through the proxy pool
        response = requests.get(URL, proxies=proxy)
        time.sleep(random.random())  # polite random delay between requests
        if flag == 1:
            photos_url = parsel.Selector(response.text).xpath(
                '//div[@class="right floated left aligned twelve wide column"]/a/@href'
            ).extract()  # album links (menu layout)
        elif flag == 0:
            photos_url = parsel.Selector(response.text).xpath(
                '//div[@class="ui segment imghover"]/a/@href').extract(
                )  # album links (search layout)
        for photo_url in photos_url:
            # e.g. https://www.fabiaoqing.com/bqb/detail/id/9825.html
            URL1 = 'https://www.fabiaoqing.com' + photo_url  # absolute album URL
            print(URL1)
            response1 = requests.get(URL1, proxies=proxy)
            time.sleep(random.random())
            image_urls = parsel.Selector(response1.text).xpath(
                '//div[@class="swiper-slide swiper-slide-active bqpp"]/a/@href'
            ).extract()  # detail-page link of each image in the album
            image_name = parsel.Selector(response1.text).xpath(
                '//div[@class="ui segment imghover"]/h1/text()').extract()[
                    0]  # album title, used as the folder name
            for image_url in image_urls:
                # e.g. https://www.fabiaoqing.com/biaoqing/detail/id/149344.html
                URL2 = 'https://www.fabiaoqing.com' + image_url
                response2 = requests.get(URL2, proxies=proxy)
                time.sleep(random.random())
                images_info = parsel.Selector(response2.text).xpath(
                    '//div[@class="swiper-slide swiper-slide-active"]/img'
                ).extract()  # raw <img> fragments for this image
                for image_info in images_info:
                    soup = BeautifulSoup(image_info, 'html.parser')
                    name1 = str(soup.img.attrs['title'])
                    image = requests.get(soup.img.attrs['src'], proxies=proxy)
                    # Strip characters that are illegal in Windows file names.
                    NAME = name1.split('-')[0].strip().replace(
                        '/', '').replace('\\', '').replace(':', '').replace(
                            ':', '').replace('"', '').replace('*', '').replace(
                                '?', '').replace('?', '').replace('|', '').replace(
                                    '<', '').replace('>', '')
                    PATH = path + image_name + '/'  # one folder per album
                    if os.path.exists(PATH):
                        pass
                    else:
                        os.makedirs(PATH)
                    # Keep the original file extension from the source URL.
                    with open(
                            PATH + NAME + '.' +
                            str(soup.img.attrs['src']).split('.')[-1],
                            'wb') as f:
                        f.write(image.content)
                    text.insert(
                        'end', '正在下载:' + NAME[:20] + '.' +
                        str(soup.img.attrs['src']).split('.')[-1] + '\n')
                    window.update()  # keep the GUI responsive during downloads
                    text.see(tk.END)  # auto-scroll the log
    text.insert(tk.END, '\n*********下载完成!********\n')
    window.update()
'mldm': '08', 'mlmc': '', 'yjxkdm': '0812', 'zymc': '', 'xxfs': '1', 'pageno': str(first_post_page), } headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' } response = requests.post(url=url, data=data, headers=headers) html_data = response.text selector = parsel.Selector(html_data) max_page_num = int( selector.css('.zsml-page-box li:nth-last-child(3) a::text').get()) print(max_page_num) for page in range(1, max_page_num + 1): print(page) data = { 'ssdm': '', 'dwmc': '', 'mldm': '08', 'mlmc': '', 'yjxkdm': '0801', 'zymc': '', 'xxfs': '',
def extract_df(in_loteca_htm):
    """Preprocess the data in the loteca file

    Returns:
        A DataFrame with all the rounds present in the Loteca file
    """
    # Load and decode the raw HTML (Loteca files are windows-1252 encoded).
    with open(in_loteca_htm, mode='rb') as fp:
        body = fp.read().decode('windows-1252')
    selector = parsel.Selector(body)

    # First <tr> is the header; the rest are data rows.
    all_rows = selector.css('tr')
    columns = all_rows[0].css('font::text').extract()

    rounds = []
    for row in all_rows[1:]:
        cells = row.css('td')
        if len(cells) == 28:
            # A round row: take the first text node of every cell.
            rounds.append([cell.css('::text').extract_first() for cell in cells])
        elif len(cells) == 2:
            # A state row: carries no round data.
            continue
        else:
            raise ValueError("Loteca file row with different number of cells")

    df = pd.DataFrame.from_records(rounds, columns=columns)

    # Drop location and per-game columns — not needed downstream.
    df = df.drop(['Cidade', 'UF'] + ['Jogo_%s' % i for i in range(1, 15)],
                 axis=1)

    # Normalize the remaining column names.
    df.columns = ['roundno', 'date', 'winners14', 'shared14', 'accumulated',
                  'accumulated14', 'winners13', 'shared13', 'winners12',
                  'shared12', 'total_revenue', 'prize_estimative']

    # Coerce each column to its proper dtype.
    for col in ('roundno', 'winners14', 'winners13', 'winners12'):
        df[col] = df[col].apply(_read_int)
    for col in ('shared14', 'shared13', 'shared12', 'accumulated14',
                'total_revenue', 'prize_estimative'):
        df[col] = df[col].apply(_read_float)
    df['date'] = pd.to_datetime(df.date, dayfirst=True)
    df['accumulated'] = df.accumulated.apply(lambda x: x == 'SIM')

    return df.set_index('roundno')
xindex = int(index[yindex][0]) break #找到具体的位置 value = values[xindex - 1] xyindex = int(posXY[0] / 14) return value[xyindex] html_content = '' with open('dazhong1.html', 'r') as f: html_content = f.read() selector = parsel.Selector(html_content) dic = { 'bb': { 'rt': { 'rt03h': (448.0, 87.0), 'rt0dm': (112.0, 159.0), 'rt0ff': (280.0, 87.0), 'rt0j3': (308.0, 187.0), 'rt0ny': (112.0, 261.0), 'rt119': (252.0, 222.0), 'rt11b': (210.0, 222.0), 'rt17d': (448.0, 159.0), 'rt1ts': (420.0, 222.0), 'rt1z1': (56.0, 23.0), 'rt25z': (434.0, 117.0),
def parse(data):
    """Wrap an HTML/XML string in a parsel Selector."""
    selector = parsel.Selector(text=data)
    return selector
# Fetch the publisher's "about" page and append its title and body to about.txt.
import re
import parsel
from urllib import request

url = "https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml"
with request.urlopen(url) as req:
    text = req.read().decode("utf8")

# The page title sits in the only <h1>.
title = re.search("<h1>(.*)</h1>", text).group(1)
paragraphs = parsel.Selector(text).css(".column_content_inner p font::text").extract()
content = "\n".join(paragraphs)

with open("about.txt", "a") as file:
    file.write(title)
    file.write("\n")
    file.write(content)
# ['Albert Einstein', 'J.K. Rowling', 'Albert Einstein', 'Jane Austen', 'Marilyn Monroe', 'Albert Einstein', 'André Gide', 'Thomas A. Edison', 'Eleanor Roosevelt', 'Steve Martin'] print(res.headers["Content-Type"]) # text/html; charset=utf-8 """ terminal python3 """ import requests import parsel # importa a pagina spider_quote from spider_quote import fetch_content # site alvo page_content = fetch_content("https://quotes.toscrape.com/") # seleciona o conteuda da pagina sel = parsel.Selector(page_content) # seleciona a classe css quotes = sel.css("div.quote") # mostra o conteudo obtido print(quotes) # mostra os autores sel.css("div.quote small.author").getall() # pega somente o texto sel.css("div.quote small.author::text").getall() # --------------------------------------------------------------------------- # # - > AULA ao VIVO - 34.3 ----- <--- / FIM --------------------------------- //
# Harvest free proxies from kuaidaili (pages 1-4) into proxies_list.
proxies_list = []
for page in range(1, 5):
    print('=============正在获取第{}数据============'.format(page))
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(str(page))
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.113 Safari/537.36'
    }
    response = requests.get(base_url, headers=headers)
    # Parse the proxy table on the page.
    html_data = parsel.Selector(response.text)
    rows = html_data.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    for row in rows:
        scheme = row.xpath('./td[4]/text()').extract_first()  # protocol (HTTP/HTTPS)
        host = row.xpath('./td[1]/text()').extract_first()    # IP address
        port = row.xpath('./td[2]/text()').extract_first()    # port number
        # One {scheme: "host:port"} dict per proxy, requests-style.
        proxies_list.append({scheme: host + ':' + port})
# Download one chapter from shuquge and print its title and cleaned body text.
import requests
import parsel

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
chapter_url = 'http://www.shuquge.com/txt/8659/31165742.html'
response = requests.get(url=chapter_url, headers=headers)
response.encoding = response.apparent_encoding  # let requests guess the real charset

selector = parsel.Selector(response.text)
h1 = selector.css('h1::text').getall()
print(h1)
# Combined selectors on the same tag must not contain a space.
content = selector.css('div#content.showtxt::text').getall()
list_content = []
for line in content:
    stripped = line.strip()
    list_content.append(stripped)
    print(stripped)
print(",".join(list_content))
# print(css_response.text) # print(len(css_response.text)) pattern = re.compile('.(\w+){background:-(\d+\.\d+)px -(\d+\.\d+)px;}') class_map = re.findall(pattern, css_response.text) print(class_map) coord = class_map[0] if coord: coord_name, coord_x, coord_y = coord coord_x, coord_y = float(coord_x), float(coord_y) import parsel print(svg_response.text) svg_data = parsel.Selector(svg_response.text) texts = svg_data.xpath('//text') # 根据类名的位置确定y(在哪一行) print(coord_y) axis = [] # 那已知点的y去svg表中查询位于哪一行 for text in texts: if coord_y <= int(text.attrib.get('y')): axis.append(text.attrib.get('y')) # axis = [text.attrib.get('y') for text in texts if coord_y <= int(text.attrib.get('y'))] print('axis', axis) axis_y = axis[0] print(axis_y)
font_dir = os.path.join(os.path.curdir, "fonts") if not os.path.isdir(font_dir): os.mkdir(font_dir) headers = { 'Referer': "https://maoyan.com/films/1212", 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36', } url = 'https://maoyan.com/films/1212' # r = requests.get(url, headers=headers) selector = parsel.Selector(r.text) woff = selector.re_first("url\('(.+?\.woff)'\)") #os.path.basename 作用: # url('//vfile.meituan.net/colorstone/d8b92513098c90cbadf06d2779d686492080.woff') # 提取woff中的d8b92513098c90cbadf06d2779d686492080.woff作为文件名 download_font_path = os.path.join(font_dir, os.path.basename(woff)) if not os.path.isfile(download_font_path): urllib.request.urlretrieve('https:%s' % woff, download_font_path) # 解析当前页面使用的字体文件 font = TTFont(download_font_path) # 字形编码与字符编码的对应关系 hex2u = { font['glyf'][u].coordinates.array.tobytes().hex(): u for u in font.getGlyphOrder()[2:]
def get_detail_params(url, page_source, c):
    """Assemble the POST form payload for a Taobao order-confirmation page.

    :param url: item detail URL containing ``id=...&skuId=...``
    :param page_source: raw HTML of the item detail page
    :param c: cookie mapping holding the ``_tb_token_`` anti-CSRF value
    :return: tuple of (form-data dict, item id string)
    """
    selector = parsel.Selector(page_source)
    # Hidden fields on the buy form (#J_FrmBid).
    seller_id = selector.xpath(
        "//form[@id='J_FrmBid']/input[@name='seller_id']/@value").get()
    photo_url = selector.xpath(
        "//form[@id='J_FrmBid']/input[@name='photo_url']/@value").get()
    rootCatId = selector.xpath(
        "//form[@id='J_FrmBid']/input[@name='rootCatId']/@value").get()
    # Stock and price are embedded as JSON snippets in the page source.
    allow_quantity = re.findall("\"quantity\":(\d+),", page_source)[0]
    param = re.findall("id=(\d+).*&skuId=(\d+)", url)[0]  # (item_id, sku_id)
    buy_param = param[0] + "_" + "1" + "_" + param[1]  # itemId_quantity_skuId
    _tb_token_ = c.get("_tb_token_")  # anti-CSRF token from the cookies
    skuId = param[1]
    item_id_num = param[0]
    item_id = param[0]
    auction_id = param[0]
    buy_now = re.findall("\"price\":\"(\d+\.\d+)\",", page_source)[0]
    current_price = buy_now
    seller_num_id = selector.xpath("//*[@id=\"dsr-userid\"]/@value").get()
    # Form payload mirroring what the site's own buy button submits; fields
    # marked '(unable to decode value)' were captured opaquely from a real
    # request and are sent back verbatim.
    data = {
        'title': '(unable to decode value)',
        'x_id': '',
        'seller_id': seller_id,
        'seller_nickname': '(unable to decode value)',
        'who_pay_ship': '(unable to decode value)',
        'photo_url': photo_url,
        'region': '(unable to decode value)',
        'auto_post': 'false',
        'etm': 'post',
        'virtual': 'false',
        'rootCatId': rootCatId,
        'auto_post1': '',
        'buyer_from': 'ecity',
        'root_refer': '',
        'item_url_refer': 'https%3A%2F%2Fs.taobao.com%2F',
        'allow_quantity': allow_quantity,
        'buy_param': buy_param,
        'quantity': '1',
        '_tb_token_': _tb_token_,
        'skuInfo': '(unable to decode value)',
        'use_cod': 'false',
        '_input_charset': 'UTF-8',
        'destination': '350100',
        'skuId': skuId,
        'bankfrom': '',
        'from_etao': '',
        'item_id_num': item_id_num,
        'item_id': item_id,
        'auction_id': auction_id,
        'seller_rank': '0',
        'seller_rate_sum': '0',
        'is_orginal': 'no',
        'point_price': 'false',
        'secure_pay': 'true',
        'pay_method': '(unable to decode value)',
        'from': 'item_detail',
        'buy_now': buy_now,
        'current_price': current_price,
        'auction_type': 'b',
        'seller_num_id': seller_num_id,
        'activity': '',
        'chargeTypeId': '',
    }
    return data, param[0]
def get_suburls(html_content, logger):
    """Collect absolute Daily Mail team-page URLs found in *html_content*."""
    selector = pr.Selector(html_content)
    hrefs = selector.xpath('//a[contains(@href, "teampages")]/@href').extract()
    urls = []
    for href in hrefs:
        urls.append('http://www.dailymail.co.uk' + href)
    return urls
# Scrape second-hand housing listings from Lianjia (Changsha, page 1) via XPath.
import requests
import parsel

url = 'https://cs.lianjia.com/ershoufang/pg1/'
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
}
res = requests.get(url=url, headers=headers)
selector = parsel.Selector(res.text)

# One <li> per listing.
for item in selector.xpath('//ul[@class="sellListContent"]/li'):
    title = item.xpath('.//div[@class="title"]/a/text()').get()
    plase = "- ".join(
        item.xpath('.//div[@class="positionInfo"]/a/text()').getall())
    value = item.xpath('.//div[@class="totalPrice"]/span/text()').get() + "W"
    unitPrice = item.xpath('.//div[@class="unitPrice"]/span/text()').get()
    houseInfo = item.xpath('.//div[@class="houseInfo"]/text()').get()
    print(title, plase, value, unitPrice, houseInfo, sep=" | ")
def extract_headlines(html_content, modifier, logger):
    '''
    Returns a dictionary with the key aspects of BBC headlines
    We have two types - the team pages and the confirmed transfers
    In the team pages - we have an image which also has a title (could be useful). Each article is separated into "article" blocks and the URLs are non-absolute with a numeric code
    Could potentially improve timing here as the loops might be excessive

    :param html_content: raw page HTML
    :param modifier: truthy -> parse the team-page "article" layout;
                     falsy -> parse the plain paragraph-link layout
    :param logger: unused here; kept for interface symmetry with callers
    :return: dict of 'article_N' -> info dict (title/link/summary/image/date)
    '''
    sel = pr.Selector(html_content)
    articles_info = {}
    if modifier:
        # Get the article information - write an initial search and then find ANYTHING inside
        search = '//article[@class = "clearfix faux-block-link lakeside lakeside--auto lakeside--has-media"]'
        articles = sel.xpath(search)
        # Need to loop because we want all the info to match up - even if not present
        for i, article in enumerate(articles):
            # Step through each article and extract_first to only return value and not list
            article_title = article.xpath(
                './/span[@class = "lakeside__title-text"]/text()'
            ).extract_first()
            article_link = article.xpath(
                './/a[@class = "faux-block-link__overlay"]/@href'
            ).extract_first()
            article_summary = article.xpath('.//p/text()').extract_first()
            article_image = article.xpath('.//img/@alt').extract_first()
            article_date = article.xpath(
                './/span[@class = "timestamp"]/time/text()').extract_first()
            if article_summary:
                article_summary = article_summary.strip()
            # Relative links get the BBC host prepended.
            if 'http://' not in article_link and 'https://' not in article_link:
                article_link = 'http://www.bbc.com' + article_link
            article_info = {
                'article_title': article_title.strip(),
                'article_link': article_link,
                'article_summary': article_summary,
                'article_image': article_image,
                'article_date': article_date
            }
            articles_info['article_{}'.format(i + 1)] = article_info
    else:
        # Plain layout: titles/links/summaries come from parallel xpath lists.
        # NOTE(review): these lists are assumed to line up index-for-index —
        # a missing summary on one article would shift everything; confirm.
        article_titles = sel.xpath('//p/a/text()').extract()
        article_links = sel.xpath('//p/a/@href').extract()
        article_summaries = sel.xpath('//p/text()').extract()
        article_images = ''  # no images/dates in this layout (scalar filler)
        article_dates = ''
        # Now combine into dictionaries like above
        for i, title in enumerate(article_titles):
            if article_summaries[i]:
                article_summaries[i] = article_summaries[i].strip()
            if 'http://' not in article_links[
                    i] and 'https://' not in article_links[i]:
                article_links[i] = 'http://www.bbc.com' + article_links[i]
            article_info = {
                'article_title': title.strip(),
                'article_link': article_links[i],
                'article_summary': article_summaries[i],
                'article_image': article_images,
                'article_date': article_dates
            }
            articles_info['article_{}'.format(i + 1)] = article_info
    return articles_info
def get_html(fours_url):
    """Scrape 4S dealer listings from *fours_url*, enrich each entry from its
    detail page, and insert new rows into the t_mbr_role_storehouse table.

    Relies on module-level ``headers``, ``cursor`` and ``conn``.
    Any TypeError (e.g. a failed DB lookup returning None) aborts the whole
    page silently — NOTE(review): this also hides genuine bugs; confirm intended.
    """
    try:
        print('正在爬取的连接', fours_url)
        resp_html = requests.get(fours_url, headers=headers)
        resp_html.encoding = 'gbk'  # site is GBK encoded
        resp_data = parsel.Selector(resp_html.text)
        data_list = resp_data.xpath(
            '//ul[@class="list-box"]/li[@class="list-item"]')
        for li in data_list:
            print(fours_url)
            """请求详情"""
            # Absolute URL of the dealer's detail page.
            store_name_link = 'https:' + li.xpath(
                './ul[@class="info-wrap"]/li[1]/a/@href').get()
            # Request the detail page.
            detail_resp = requests.get(headers=headers, url=store_name_link)
            detail_resp.encoding = 'gbk'
            detail_data = parsel.Selector(detail_resp.text)
            # Dealer (4S store) name.
            store_name = detail_data.xpath(
                '//div[@id="breadnav"]/p/span[2]/text()').get()
            # Landline number.
            telephone_number = detail_data.xpath(
                '//div[@id="400set"]/span[@class="dealer-api"]/span/text()'
            ).get()
            # Street address.
            address = detail_data.xpath(
                '//div[@id="dealerposi"]/div[@class="allagency-cont"]/p/@title'
            ).get()
            print(address)
            if address == '':
                continue  # skip entries with no address
            # City name, then its id from the national_cities table.
            city_name = detail_data.xpath(
                '//div[@id="breadnav"]/p/a/text()').get()
            sql2 = "select city_id from national_cities where city_name ='{}'".format(
                city_name)
            cursor.execute(sql2)
            city_id = cursor.fetchone()[0]
            # Timestamp pinned to the 28th so only this month's rows survive.
            last_sync_time = datetime.datetime.now().strftime(
                "%Y-%m-28 23:35:23")
            # Main brands sold by the dealer; the loop keeps the last one.
            brand_list = detail_data.xpath(
                '//div[@class="brandtree"]/div/p[@class="text"]/text()'
            ).getall()
            print(brand_list)
            brand_name = ''
            for bname in brand_list:
                brand_name = bname
            # Normalize the middle-dot variant used by the brand table.
            if brand_name == '阿尔法·罗密欧':
                brand_name = '阿尔法・罗密欧'
            if brand_name == '阿斯顿·马丁':
                brand_name = '阿斯顿・马丁'
            # Brand id lookup.
            sql2 = "select brand_id from t_car_brand where brand_name='{}'".format(
                brand_name)
            cursor.execute(sql2)
            brand_id = cursor.fetchone()[0]
            data = (brand_name, brand_id, store_name, telephone_number,
                    address, city_name, city_id, last_sync_time)
            print(data)
            # Dedupe: insert only if no identical row exists yet.
            # NOTE(review): SQL is built by string formatting from scraped
            # text — injection-prone; should use parameterized queries.
            sql = "select * from t_mbr_role_storehouse where brand_name='{}'and brand_id='{}'and store_name ='{}' and telephone_number ='{}' and address='{}'and city_name='{}'".format(
                data[0], data[1], data[2], data[3], data[4], data[5])
            cursor.execute(sql)
            many = cursor.fetchone()
            if many:
                print('此数据表中已存在')
            else:
                insert_sql = "insert into t_mbr_role_storehouse(brand_name,brand_id,store_name,telephone_number,address,city_name,city_id,last_sync_time)values (%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(insert_sql, data)
                conn.commit()  # commit each new row
                print('数据提交完成')
    except TypeError:
        pass
conn = pymysql.connect(host="112.126.89.134", user="******", password="******", port=3306, db="jgcproddb",charset="utf8") # conn = pymysql.connect(host="localhost", user="******", password="", port=3306, db="jdbc", charset="utf8") cursor = conn.cursor() # 创建游标 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36', 'Connection': 'close' } url = "https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId" resp = requests.get(url=url,headers=headers) response = parsel.Selector(resp.text) # 品牌id ids = response.xpath('//body/ul/li/@id').getall() base_url = 'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId' for var in ids: id = var.replace('b', '=') base_url= url + id base_response = requests.get(base_url,headers=headers) base_response.encoding='gbk' base_resp = parsel.Selector(base_response.text) # print(base_resp)
def choice_fun31(ra2, e2, window4):
    """Build the download sub-panel for a keyword search.

    :param ra2: Tk variable holding the chosen mode (1 = single stickers,
                2 = sticker albums)
    :param e2: Tk entry containing the search keyword
    :param window4: parent Tk window the widgets are added to
    """
    global keyword
    flag = 0
    choice = ra2.get()
    keyword = e2.get()
    if choice == 1:
        # Single-sticker mode: downloads go into a keyword-named folder.
        PATH = path + keyword + '/'
        if os.path.exists(PATH):
            pass
        else:
            os.makedirs(PATH)
        # e.g. https://www.fabiaoqing.com/search/search/keyword/%E5%B0%8F%E9%BB%84%E9%B8%AD/type/bq.html
        url2 = 'https://www.fabiaoqing.com/search/search/keyword/' + keyword + '/type/bq.html'
        # Second-to-last pager link holds the total page count.
        page_num = \
            parsel.Selector(request_url(url2).text).xpath('//div[@class="ui pagination menu"]/a/text()').extract()[
                -2].strip()
        tk.Label(window4, text="请输入下载页数:", font=('Arial', 10), width=20).grid(column=2)
        tk.Label(window4, text="(每页 45 张,共 " + page_num + " 页。)", font=('Arial', 8), width=30).grid(column=2)
        e1 = tk.Entry(window4, font=('Arial', 10), width=10)
        e1.grid(column=2)
        # NOTE(review): text2 is referenced in the lambda before it is created
        # below — valid only because the lambda runs after this function ends.
        tk.Button(window4, text='确认', font=('Arial', 9), width=6, height=1,
                  command=lambda: check_download1(e1, PATH, window4, text2, flag)).grid(column=2)
        tk.Label(window4, text=" ", width=8).grid()
        tk.Label(window4, text="下载情况:").grid()
        tk.Label(window4, text=" ", width=8).grid()
        # Scrolling log of download progress.
        text2 = ScrolledText(window4, font=('微软雅黑', 10), width=53, height=12, fg='blue')
        text2.grid()
    elif choice == 2:
        # Album mode: same flow against the album (bqb) search endpoint.
        url3 = 'https://www.fabiaoqing.com/search/search/keyword/' + keyword + '/type/bqb.html'
        page_num = parsel.Selector(request_url(url3).text).xpath('//div[@class="ui pagination menu"]/a/text()') \
            .extract()[-2].strip()
        tk.Label(window4, text=" ", width=8).grid()
        tk.Label(window4, text="请输入下载页数:", font=('Arial', 10), width=20).grid()
        tk.Label(window4, text="(每页 8 套,共 " + page_num + " 页。)", font=('Arial', 8), width=30).grid()
        e1 = tk.Entry(window4, font=('Arial', 10), width=10)
        e1.grid()
        tk.Button(window4, text='确认', font=('Arial', 9), width=6, height=1,
                  command=lambda: check_download2(url3, e1, text3, window4, 0)
                  ).grid()
        tk.Label(window4, text=" ", width=8).grid()
        tk.Label(window4, text="下载情况:").grid()
        tk.Label(window4, text=" ", width=8).grid()
        text3 = ScrolledText(window4, font=('微软雅黑', 10), width=53, height=12, fg='blue')
        text3.grid()
async def get_html(url):
    """Fetch a 16888 sales-ranking page, collect per-series links, then pull
    each series' monthly sales history into the module-level ``data_list``.

    Relies on module-level ``headers``, ``series_link_list``, ``cursor`` and
    ``data_list``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            html = await response.text()  # 可以直接获取bytes -> could fetch raw bytes instead
            resp = parsel.Selector(html)
            # Ranking table rows (skip the header row).
            tr_list = resp.xpath(
                '//table[@class="xl-table-def xl-table-a"]//tr[position()>1]')
            for tr in tr_list:
                # ranking = tr.xpath('./td[@class="xl-td-t1"]/text()').get()
                # # print('排名',ranking)
                series_name = tr.xpath('./td[2]/a/text()').get()
                # Normalize the one series whose name differs in our DB.
                if series_name == '宏光MINI EV':
                    series_name = '宏光MINIEV'
                """获取车主之家车系销量的连接"""
                # Absolute link to the series' sales-detail page.
                series_link = 'https://xl.16888.com' + tr.xpath(
                    './td[2]/a/@href').get()
                series_link_list.append(series_link)
            """月销量数据"""
            # Walk every collected series page (synchronously, via requests).
            for url in series_link_list:
                detail_resp = requests.get(url, headers).text
                detail_data = parsel.Selector(detail_resp)
                # Series name with the '销量详情' suffix stripped.
                series_name = detail_data.xpath(
                    '//div[@class="xl-level-head clr"]/span/text()').get(
                    ).replace('销量详情', '')
                if series_name == '宏光MINI EV':
                    series_name = '宏光MINIEV'
                """获取车系id/厂商id"""
                cursor.execute(
                    "select series_id,car_series_id,car_series,car_brand_id from t_car_category where category_fullname='{}'"
                    .format(series_name))
                id_list = cursor.fetchall()
                # Defaults stay '' when the lookup returns no rows; the loop
                # below keeps the values from the LAST matching row.
                series_id = ''        # series id
                car_series_id = ''    # manufacturer id
                car_series = ''       # manufacturer name
                brand_id = ''         # brand id
                for id in id_list:
                    series_id = id[0]
                    car_series_id = id[1]
                    car_series = id[2]
                    brand_id = id[3]
                """获取品牌名称"""
                cursor.execute(
                    "select brand_name from t_car_brand where brand_id='{}'".
                    format(brand_id))
                brand_name = cursor.fetchone()[0]
                # Monthly rows: skip the header, keep at most 23 months.
                tr_list = detail_data.xpath(
                    '//table[@class="xl-table-def xl-table-a"]/tr[position()>1][position()<24]'
                )
                # print(tr_list)
                for tr in tr_list:
                    # period (month)
                    sale_time = tr.xpath('./td[1]/text()').get()
                    # units sold that month
                    monthly_sales = tr.xpath('./td[2]/text()').get()
                    # overall sales rank that month
                    now_monthly_sales = tr.xpath('./td[3]/a/text()').get()
                    # share of the manufacturer's volume
                    share_of_manufacturers = tr.xpath('./td[4]/text()').get()
                    # rank within the manufacturer
                    ranking_among_manufacturers = tr.xpath(
                        './td[5]/a/text()').get()
                    # rank within the SUV segment
                    ranking_in_suv = tr.xpath('./td[6]/a/text()').get()
                    # Timestamp pinned to the 28th (monthly refresh marker).
                    last_sync_time = datetime.datetime.now().strftime(
                        "%Y-%m-28 23:35:23")
                    data = (series_id, series_name, car_series_id, car_series,
                            brand_id, brand_name, sale_time, monthly_sales,
                            now_monthly_sales, share_of_manufacturers,
                            ranking_among_manufacturers, ranking_in_suv,
                            last_sync_time)
                    data_list.append(data)
# Scrape role-playing game listings from yiwan.com (100 pages) into a CSV file.
import requests
import parsel

with open('易玩角色扮演.csv', 'a', encoding='utf-8') as f:
    # CSV header row: name, id, tags, summary, download URL.
    biaoti = f'{"名字"},{"id"},{"标签"},{"简介"},{"下载地址"}'
    f.write(biaoti)
    f.write('\n')
for pag in range(1, 101):
    url = f'http://www.yiwan.com/az/3_0_new_{pag}/'
    # NOTE(review): verify=False disables TLS certificate checking.
    response = requests.get(url=url, verify=False)
    selector = parsel.Selector(response.text)
    listli = selector.xpath("//div[@class='r-content softlist']/ul/li")
    for li in listli:
        # Game name.
        name = li.xpath(
            "./div[@class='softlist-t']/h3[@class='softlist-t2']/a/text()"
        ).get()
        # Numeric game id pulled out of the download link.
        id = li.xpath("./div[@class='softlist-download']/a/@href").re(
            "game/(.*?)/")[0]
        # Absolute detail-page URL.
        xqlj = "http://www.yiwan.com/" + li.xpath(
            "./div[@class='softlist-download']/a/@href").get()
        res = requests.get(url=xqlj)
        sel = parsel.Selector(res.text)
        # Direct download URL from the detail page; 'javascript:;' means none.
        downurl = sel.xpath("//div[@class='gi_r']/a/@href").get().strip("\n")
        if downurl == "javascript:;":
            downurl = ""
        # Tag list for the game.
        tag = li.xpath(
            "./div[@class='softlist-t']/p[@class='softlist-t4']//a/text()"
        ).getall()
        # tag = []
        # for tag in tags:
# Scrape image URLs from a Baidu Tieba forum page and its threads.
import requests
import re
from fake_useragent import UserAgent
import parsel

url = 'https://tieba.baidu.com/f?kw=%CA%AF%D4%AD%C0%EF%C3%C0&fr=ala0&loc=rec'
# BUG FIX: the header name must be 'User-Agent' (was 'UserAgent'), and it must
# be passed via the headers= keyword — passed positionally it lands in the
# `params` argument, so no custom header was ever sent.
headers = {'User-Agent': UserAgent().chrome}
response = requests.get(url, headers=headers).text
# print(response)
# Parse the forum listing.
html = parsel.Selector(response)
print(html)
# Relative links of every thread started on this page.
title = html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href').getall()
print(title)
# Build absolute thread URLs.
furl = 'https://tieba.baidu.com'
for tit in title:
    ur = furl + tit
    # print('当前地址为', ur)
    # Request each thread page and pull the posted image sources.
    response2 = requests.get(ur, headers=headers).text
    f_img = parsel.Selector(response2)
    img_data = f_img.xpath('//cc/div/img[@class="BDE_Image"]/@src').getall()
    print(img_data)
# coding = utf-8
# Download League of Legends wallpapers from win4000, one image per album.
import requests
import parsel

listing_html = requests.get(url='http://www.win4000.com/zt/yingxionglianmeng.html').text
listing = parsel.Selector(listing_html)
album_links = listing.xpath("//div[@class='tab_box']//a/@href").getall()
print(album_links)

counter = 1
for link in album_links:
    # Each album page exposes one full-size image.
    album = parsel.Selector(requests.get(link).text)
    jpg_url = album.xpath("//div[@class='pic-meinv']//img/@src").get()
    print(jpg_url)
    payload = requests.get(jpg_url).content
    print(counter)
    with open('图片{}.jpg'.format(counter), 'wb') as fp:
        fp.write(payload)
    print('保存成功', counter)
    counter += 1
def parse_single_league_page(html, league_name):
    """Extract club names from a league page, pairing each with *league_name*."""
    sel = parsel.Selector(html)
    clubs = sel.xpath('./body/tbody/tr/td/a/text()').extract()
    records = []
    for club in clubs:
        records.append({'club': club, 'league': league_name})
    return records
import parsel import os headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } for page in range(1, 310): # Total 309pages print(f'======= Scraping data from page {page} =======') url = f'https://www.bikeexif.com/page/{page}' response = requests.get(url, headers=headers) selector = parsel.Selector(response.text) containers = selector.xpath( '//div[@class="container"]/div/article[@class="smallhalf"]') for v in containers: old_title = v.xpath( './/div[2]/h2/a/text()').get() #.replace(':', ' -') if old_title is not None: title = old_title.replace(':', ' -') title_url = v.xpath('.//div[2]/h2/a/@href').get() print(title, title_url) os.makedirs(os.path.join('img', title),
# Download every card image from 138u.cn, naming each file by its alt text.
import parsel
import requests

url = "https://www.138u.cn/"
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43'
}
respond = requests.get(url=url, headers=headers)
respond.raise_for_status()  # abort on HTTP errors
respond.encoding = "UTF-8"

page = parsel.Selector(respond.text)
# Alt texts double as file names; data-original holds the real image URL.
titles = page.xpath('//div[@class="card-deck"]//div/a/img/@alt').getall()
photo_urls = page.xpath(
    '//div[@class="card-deck"]//div/a/img/@data-original').getall()

for src, label in zip(photo_urls, titles):
    payload = requests.get(url=src, headers=headers, timeout=50).content
    with open('../辅助爬取/Photoes/' + label + '.jpg', mode="wb") as f:
        f.write(payload)
    print(label + "成功了")
print("------finished------")
def extract_headlines(html_content, modifier, logger):
    '''
    Extract headlines from Skysports pages
    The modifier points to regional articles or not

    :param html_content: raw HTML of a Skysports page
    :param modifier: truthy -> parse the regional "paper-stories" layout;
                     falsy -> parse the transfer-centre layout
    :param logger: unused here — kept, presumably, for a shared call signature
    :return: dict mapping 'article_1', 'article_2', ... to dicts with keys
             article_title / article_link / article_summary / article_image /
             article_date
    '''
    # NOTE(review): `pr` is assumed to be parsel imported elsewhere — confirm.
    sel = pr.Selector(html_content)
    articles_info = {}
    if modifier:
        # For the regional articles, would like to tag on the source
        sources = sel.xpath('//div[@class = "paper-stories"]')
        article_titles = []
        for source in sources:
            source_name = source.xpath('.//p/text()').extract_first()
            # Prefix every headline with the paper it came from.
            article_titles.extend([
                source_name + ' - ' + title
                for title in source.xpath('.//li//text()').extract()
            ])  # Skybet means need // for text
        # Fill in rest with blanks or filler data (links - for title of file)
        article_links = [
            '/fake_link/article_' + str(i)
            for i in range(0, len(article_titles))
        ]
        article_summaries = [''] * len(article_titles)
        article_images = [''] * len(article_titles)
        article_dates = [''] * len(article_titles)
    else:
        # Declare an extra condition because sometimes it would link to a
        # generic article that caused bugs
        extra_condition = './div[@class = "figure span1/3 -spr0-5"]/a/@href = "http://www.skysports.com/transfer-centre"'
        # Note that there is a "show more" section that cannot load HTML for
        # and three different types of article in general
        transfer_headlines = sel.xpath(
            '//div[@class = "box media -vertical -bp20-horizontal"]')
        transfer_sublines = sel.xpath(
            '//div[@class = "box media -bp30-vertical" and not({})]'.format(
                extra_condition))
        transfer_sublinks = sel.xpath('//ul[@class = "list -bullet text-s"]')
        # For main headlines — the five lists below are parallel arrays
        # indexed together in the final loop, so their lengths must stay
        # in lockstep through every extend() below.
        article_titles = transfer_headlines.xpath(
            './/a[@class = "-a-block -clear"]/h2/text()').extract()
        article_links = transfer_headlines.xpath(
            './/a[@class = "-a-block -clear"]/@href').extract()
        article_summaries = transfer_headlines.xpath(
            './/a[@class = "-a-block -clear"]/p/text()').extract()
        article_images = transfer_headlines.xpath('.//img/@data-src').re(
            '[^\/](\/[^\./]*\.[A-z]*|#)'
        )  # Found one with a ? in the middle - first [] removes things after ://
        article_dates = [''] * len(article_titles)
        # For subheadlines
        article_titles.extend(
            transfer_sublines.xpath('.//h2/text()').extract())
        article_links.extend(
            transfer_sublines.xpath('.//a[not(@class)]/@href').extract())
        article_summaries.extend(
            transfer_sublines.xpath('.//a[not(@class)]/p/text()').extract())
        article_images.extend(
            transfer_sublines.xpath('.//img/@alt | .//img/@data-src').extract(
            ))  # Not sure how to extract one and re the other
        article_dates.extend(
            transfer_sublines.xpath(
                './/h5[@class = "caption"]/text()').extract())
        # Sublinks in headlines
        sublink_titles = transfer_sublinks.xpath('./li/a/text()').extract(
        )  # get the titles to fill in blanks later
        article_titles.extend(sublink_titles)
        article_links.extend(transfer_sublinks.xpath('./li/a/@href').extract())
        article_summaries.extend([''] * len(sublink_titles))
        article_images.extend([''] * len(sublink_titles))
        article_dates.extend([''] * len(sublink_titles))
    # Now combine into dictionaries
    for i, title in enumerate(article_titles):
        if article_summaries[i]:
            article_summaries[i] = article_summaries[i].strip()
        # Relative links get the site prefix; absolute http(s) links pass
        # through untouched.
        if article_links[i] != '' and 'http://' not in article_links[
                i] and 'https://' not in article_links[i]:
            article_links[i] = 'http://www.skysports.com' + article_links[i]
        article_info = {
            'article_title': title.strip(),
            'article_link': article_links[i],
            'article_summary': article_summaries[i],
            'article_image': article_images[i],
            'article_date': article_dates[i]
        }
        articles_info['article_{}'.format(i + 1)] = article_info
    return articles_info
""" 108 120 132 相差 12 """ one_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36', 'Referer':'https://www.pearvideo.com/category_31' } response = requests.get(url,headers=one_headers,verify=False).text # print(response) # 转为计算机识别 resp_html = parsel.Selector(response) # 解析数据 获得li_list li_list = resp_html.xpath('//body/li') # print(li_list) for li in li_list: detail_link = 'https://www.pearvideo.com/'+li.xpath('.//a/@href').get() # print(detail_link) # 请求详情页 detail_page_data = requests.get(detail_link,verify=False).text detail_data = parsel.Selector(detail_page_data) contID = detail_link.split('_')[1] video_date = f'https://www.pearvideo.com/videoStatus.jsp?contId={contID}&mrd=0.7890920916276076'