def Download_Geckodriver():  # Windows: download and extract the latest geckodriver
    try:
        response = requests.get("https://github.com/mozilla/geckodriver/releases/")
        tree = html.fromstring(response.content)
        tree = tree.xpath(
            '/html/body/div[4]/div/main/div[2]/div/div[3]/div[1]/div/div[2]/div[1]/div/div/a'
        )
        if platform.architecture()[0] == '64bit':
            Zipname = "geckodriver-" + tree[0].text_content() + "-win64.zip"
        else:
            Zipname = "geckodriver-" + tree[0].text_content() + "-win32.zip"
        response = requests.get(
            'https://github.com/mozilla/geckodriver/releases/download/'
            + tree[0].text_content() + "/" + Zipname,
            stream=True)
        if response.status_code == 200:
            with open(Zipname, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        del response
    except Exception:
        print("Could not download and write the newest version of geckodriver ...")
    try:
        if os.path.isfile("geckodriver.exe"):
            os.remove("geckodriver.exe")
        with zipfile.ZipFile(Zipname) as archive:  # avoid shadowing the built-in `zip`
            archive.extractall()
    except Exception:
        print("Could not extract the geckodriver archive ...")
def get_image_amount(url):
    # This reinvents the wheel: the logic is identical to the previous function.
    # A simpler design would bundle the title, the links and the image count
    # into one function and return the result as a tuple (see the sketch after
    # get_image_detail_website below), but for a beginner tutorial, keeping the
    # steps separate is easier to follow. Readers who want a challenge can try
    # the tuple version.
    response = requests.get(url).content
    selector = html.fromstring(response)
    # The second-to-last <a> in the pager is the last page of the gallery,
    # which equals the total number of images, so we can read it directly.
    image_amount = selector.xpath("//div[@class='page']/a[last()-1]/text()")[0]
    return image_amount
def get_image_title(url):
    # We are now on the gallery's detail page; extract the gallery title.
    response = requests.get(url).content
    selector = html.fromstring(response)
    # Note that xpath always returns a list, so [0] is needed to pick the value.
    image_title = selector.xpath("//h2/text()")[0]
    return image_title
def get_image_detail_website(url):
    # Reinventing the wheel again: the first request here only serves to read
    # the image count a second time. Passing the value between functions would
    # be cleaner; this is the straightforward version, improvements welcome.
    response = requests.get(url).content
    selector = html.fromstring(response)
    image_detail_websites = []  # container for the concrete image addresses
    image_amount = selector.xpath("//div[@class='page']/a[last()-1]/text()")[0]
    for i in range(int(image_amount)):
        image_detail_link = '{}/{}'.format(url, i + 1)
        response = requests.get(image_detail_link).content
        sel = html.fromstring(response)
        # The final download address of a single image.
        image_download_link = sel.xpath("//div[@class='content']/a/img/@src")[0]
        image_detail_websites.append(image_download_link)
    return image_detail_websites
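# A minimal sketch of the tuple approach the comments above suggest: one parse
# of the gallery page yields title, image count and download links together.
# The helper name `get_gallery_info` is an assumption for illustration, not
# part of the original tutorial; the XPaths are reused from the functions above.
import requests
from lxml import html

def get_gallery_info(url):
    selector = html.fromstring(requests.get(url).content)
    title = selector.xpath("//h2/text()")[0]
    amount = int(selector.xpath("//div[@class='page']/a[last()-1]/text()")[0])
    links = []
    for i in range(amount):
        sel = html.fromstring(requests.get('{}/{}'.format(url, i + 1)).content)
        links.append(sel.xpath("//div[@class='content']/a/img/@src")[0])
    return (title, amount, links)  # one entry point, one tuple result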
def init_ijdoukun():
    all_contents = b''
    page_addr = 'https://joyokanji.info/iji.html?'
    for page_idx in ["a", "ka", "sa", "ta", "na", "ha", "ma", "ya", "ra", "wa"]:
        page = requests.get(page_addr + page_idx)
        all_contents = all_contents + page.content
    return html.fromstring(all_contents)
def tmp_enrich_with_reading_table(self, read_tbl):
    tree = html.fromstring(read_tbl)
    read_list = tree.xpath('//*[@class="C_read"]')
    # read_list = tree.cssselect('.C_read')  # cssselect takes a CSS selector, not an XPath
    for read_item in read_list:
        content = lxml.html.tostring(read_item, encoding='unicode')
        targ = '<td class="C_read">' + self.formated + '</td>'
        if content == targ:
            print(self.formated + self.reading)
def download_html(self, url: str):
    content = self.load_url_content(url)  # try the local cache first
    if content is None:
        page = requests.get(url, allow_redirects=True)
        content = page.content if page.status_code == 200 else b""
        self.save_url_content(url, content)
    if content is None:
        return None
    tree = html.fromstring(content) if content else None
    return tree
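# download_html relies on load_url_content/save_url_content, which are not shown
# here. A minimal sketch of a file-based cache, assuming methods on the same
# class and one file per URL keyed by an MD5 hash of the URL; the names and the
# "cache" directory layout are illustrative, not the original implementation.
import hashlib
import os

def load_url_content(self, url):
    path = os.path.join("cache", hashlib.md5(url.encode()).hexdigest())
    if os.path.isfile(path):
        with open(path, "rb") as f:
            return f.read()
    return None  # cache miss: the caller fetches and saves

def save_url_content(self, url, content):
    os.makedirs("cache", exist_ok=True)
    path = os.path.join("cache", hashlib.md5(url.encode()).hexdigest())
    with open(path, "wb") as f:
        f.write(content)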
def get_page_number(num):
    # Find the detail addresses of every gallery on this listing page. A page
    # currently holds 15 galleries, so this should return a list of 15 links.
    url = 'http://www.mmjpg.com/home/' + num  # build the address of each listing page
    # Fetch the binary response body. Using .text here would make the html
    # parsing below fail; try it yourself. The short version of the
    # .content/.text distinction: .text decodes to a string and suits text and
    # links, while .content returns raw bytes and suits video, audio and images.
    response = requests.get(url).content
    # Build the selector with the lxml.html module: it turns the binary server
    # response into a readable element tree. lxml's etree module builds element
    # trees in general; for HTML strings specifically, lxml.html.fromstring is
    # the right tool, as the names suggest.
    selector = html.fromstring(response)
    urls = []  # prepare the container
    for i in selector.xpath("//ul/li/a/@href"):  # locate every gallery detail address
        urls.append(i)  # collect each address into the container
    return urls
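# A quick illustration of the .content vs. .text distinction described above;
# example.com is a stand-in URL, not part of the original code.
import requests

r = requests.get('http://example.com')
print(type(r.content))  # <class 'bytes'>: raw body, safe for parsers and media
print(type(r.text))     # <class 'str'>: body decoded using the guessed charset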
def ProcessJitenon(self):
    page = pickle.load(open(self._get_kanji_file_name(), "rb"))
    tree = html.fromstring(page.content)
    block = tree.xpath('//*[@id="kanjiright"]/table/tr/th')
    startIdx = h3_row_nb(block[0]) + h3_row_nb(block[1])
    endIdx = startIdx + h3_row_nb(block[2]) + h3_row_nb(block[3])
    # issue with kanji.jitenon for 平
    if self._Kanji == "平":
        endIdx += 1
    block = tree.xpath('//*[@id="kanjiright"]/table/tr/td')
    for idx in range(startIdx, endIdx):
        read = block[idx].getchildren()[0].text
        self._readingList.append(CYomi(read, "△" in block[idx].text))
def get_meta(self):
    plain_html = self.response_text
    h = html2text.HTML2Text()
    h.ignore_links = True
    plain_text = h.handle(plain_html)
    text_obj = html.fromstring(plain_html.lower())
    try:
        title = text_obj.xpath('//title/text()')[0].strip()
    except Exception:
        title = 'None'
    try:
        description = text_obj.xpath('//meta[@name="description"]/@content')[0].strip()
    except Exception:
        description = 'None'
    return [title, description, plain_text, plain_html]
def CargarUrl(self, url):
    from lxml import etree, html
    from urllib import request
    url_partes = url.split('://', 2)
    url = '%s://%s' % (url_partes[0], url_partes[-1].replace('//', '/'))
    req = request.Request(url, headers={'User-Agent': "Magic Browser"})
    page_content = request.urlopen(req).read()
    # Unwrap CDATA sections so the HTML parser sees their contents.
    page_content = re.sub(br'<!\[CDATA\[(.+)\]\]>', br'\1', page_content)
    page_elem = html.fromstring(page_content)
    # parser = etree.XMLParser(strip_cdata=False)
    # page_elem = etree.XML(page_content, parser)
    return page_elem
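# The commented-out lines above hint at the XML route: lxml.etree can keep
# CDATA sections intact instead of stripping them, which makes the regex
# unnecessary when the payload is well-formed XML. A minimal sketch; the sample
# document is made up for illustration.
from lxml import etree

parser = etree.XMLParser(strip_cdata=False)  # preserve <![CDATA[...]]> nodes
root = etree.XML(b'<a><![CDATA[kept as-is]]></a>', parser)
print(etree.tostring(root))  # b'<a><![CDATA[kept as-is]]></a>'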
def Check_for_MangaUpdate(Url, Old_Release_Text):
    try:
        response = requests.get(Url, stream=True)
        if response.status_code == 200:
            tree = html.fromstring(response.content)
            Title = tree.xpath(
                '/html/body/div[2]/div[2]/div[2]/div[2]/div/div[2]/div[1]/div[1]/span[1]'
            )[0].text
            Release = tree.xpath('//*[@id="main_content"]/div[2]/div[1]/div[3]/div[12]')
            Release_Text = Release[0].text_content().split(' by')[0]
        else:
            return 0, response.status_code, ""
        if Release_Text == Old_Release_Text:
            return 0, "", ""
        else:
            return 1, Release_Text, Title
    except Exception as e:
        # str(e) is needed here: an exception cannot be concatenated to a string directly
        print(str(e) + " Could not check " + Url + " for updates ...")
def Download_Ublockorigin():  # Download the latest uBlock Origin release
    try:
        response = requests.get("https://github.com/gorhill/uBlock/releases/")
        tree = html.fromstring(response.content)
        tree = tree.xpath(
            '/html/body/div[4]/div/main/div[2]/div/div[3]/div[1]/div/div[2]/div[1]/div/div/a'
        )
        xpiname = 'uBlock0_' + tree[0].text_content() + '.firefox.signed.xpi'
        response = requests.get(
            'https://github.com/gorhill/uBlock/releases/download/'
            + tree[0].text_content() + "/" + xpiname,
            stream=True)
        if response.status_code == 200:
            with open(xpiname, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
    except Exception:
        print("Could not download and write the newest version of uBlock Origin ...")
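# Both Download_Geckodriver and Download_Ublockorigin read the version out of a
# deeply nested absolute XPath, which breaks whenever GitHub redesigns the page.
# A sketch of a sturdier alternative using GitHub's public releases API; the
# endpoint and the `tag_name` field are documented GitHub API, but swapping it
# in here is a suggestion, not part of the original code.
import requests

def latest_release_tag(owner, repo):
    api = 'https://api.github.com/repos/{}/{}/releases/latest'.format(owner, repo)
    return requests.get(api).json()['tag_name']

# latest_release_tag('mozilla', 'geckodriver') could then replace the XPath lookup.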
async def miiverse(ctx, *url):
    if not url:
        return await main.say("```\nmiiverse [post URL, e.g. AYEBAAAEAAB2UZ8mAzTspw]```")
    if url[0] == 'AYEBAAAEAAB2UZ8mAzTspw':
        return await main.say("haha that isn't actually a post")
    await main.send_typing(ctx.message.channel)
    try:
        srv = urllib.request.urlopen(
            "https://miiverse.nintendo.net/posts/{0}/embed".format(url[0]))
    except Exception as e:
        return await main.say("```\n" + str(e) + "```")
    ftree = html.fromstring(srv.read().decode())
    drawing = ftree.xpath('//*[@id="post-content"]/div/p/img/@src')
    post = ftree.xpath('//*[@id="post-content"]/div/p/text()')
    screenshot = ftree.xpath('//*[@id="post-content"]/div/div[1]/img/@src')
    thing = ""
    if drawing:
        thing += "that's a beautiful painting\n\n" + drawing[0]
    else:
        thing += post[0]
    if screenshot:
        thing += "\n" + screenshot[0]
    return await main.say(thing)
PERSON_SAMPLE_SIZE = 1000

# initialize array
full = []

# get website from config
for url in my_config.config_values['name_generator_url']:
    logger.info('Looking at : %s', url)
    # repeat for sample size
    for i in range(0, PERSON_SAMPLE_SIZE):
        content = {}
        # sleep a random time between requests
        time.sleep(random.randint(10, 20))
        # get the page
        page_content, page_found = get_page(url)
        # page_content = requests.get(url)
        logger.info('page_content type : %s', type(page_content))
        tree = html.fromstring(page_content)
        address = tree.xpath('/html/body/div[3]/div[1]/div[7]/div[12]//text()')
        website = tree.xpath('/html/body/div[3]/div[1]/div[7]/div[18]//text()')
        card_number = tree.xpath('/html/body/div[3]/div[1]/div[7]/div[24]//text()')
        card_expire = tree.xpath('/html/body/div[3]/div[1]/div[7]/div[26]//text()')
        security_code = tree.xpath('/html/body/div[3]/div[1]/div[7]/div[28]//text()')
        occupation = tree.xpath('/html/body/div[3]/div[1]/div[7]/div[30]//text()')
        company = tree.xpath('/html/body/div[3]/div[1]/div[7]/div[32]//text()')
import requests
from lxml import html  # the stdlib html module has no fromstring

response = requests.get('http://packtpub.com/tech/python')
tree = html.fromstring(response.content)
books = tree.xpath('//div[@class="price-wrapper"]/text()')
print(books)
def rscrape():
    order = ['name', 'type', 'reviews', 'phone', 'address', 'url',
             'description', 'profile']
    files = listdir('**removed**')
    for f in files[:1]:
        s = requests.Session()
        s.mount('file://', FileAdapter())
        resp = s.get('file:///C:/Users/**removed**/Documents/**removed**/' + f)
        ## r = requests.get(SITE + f,
        ##                  headers={'User-Agent': 'Mozilla/5.0'})
        tree = html.fromstring(resp.content)
        rev = [r[:r.find('\n')] for r in tree.xpath(
            '//div[@class="l-small-top-space l-small-bottom-space verified-reviews"]/a[@href]/text()')]
        # indexes of contractors with >= 20 reviews
        indexes = []
        for r in range(len(rev)):
            if int(rev[r]) > 19:
                indexes.append(r)
        # dict of contractors
        contractor = {}
        contractor['name'] = [tree.xpath('//span[@itemprop="name"]/text()')[i]
                              for i in indexes]
        c_type = tree.xpath('//h1[@class="t-header-secondary"]/text()')[0]
        contractor['type'] = (c_type[:c_type.find('Com') - 1]
                              if c_type[:2] == 'Wi'
                              else c_type[:c_type.find('Cont') - 1])
        contractor['reviews'] = [rev[i] for i in indexes]
        contractor['phone'] = [tree.xpath('//span[@itemprop="telephone"]/text()')[i]
                               for i in indexes]
        contractor['address'] = [
            tree.xpath('//span[@itemprop="streetAddress"]/text()')[i] + ', ' +
            tree.xpath('//span[@itemprop="addressLocality"]/text()')[i] + ', ' +
            tree.xpath('//span[@itemprop="addressRegion"]/text()')[i] + ' ' +
            tree.xpath('//span[@itemprop="postalCode"]/text()')[i]
            for i in indexes]
        # initialize once, before the per-link loop, so results accumulate
        # instead of being reset on every iteration
        contractor['url'] = []
        contractor['description'] = []
        contractor['profile'] = []
        for link in [tree.xpath(
                '//div[@class="l-small-top-space l-small-bottom-space verified-reviews"]/a/@href')[i]
                for i in indexes]:
            l2 = SITE + link[:link.find('#')]
            print(l2)
            req = requests.get(l2, headers={'User-Agent': 'Mozilla/5.0'})
            tree2 = html.fromstring(req.text)
            print(tree2)
            try:
                contractor['url'].append(tree2.xpath('//a[@rel="nofollow"]/text()'))
            except Exception:
                contractor['url'].append('')
            # Add a try block if this ever doesn't exist
            desc = tree2.xpath('//p[@class="t-heavy"]/text()')
            print(desc)
            contractor['description'].append(desc[0])
            contractor['profile'].append(desc[1])
        print(contractor)
__author__ = 'dixon'

import requests
from lxml import html  # the stdlib html module has no fromstring

page = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
tree = html.fromstring(page.text)
print(tree)
def parser_html(html_str):
    return html.fromstring(html_str)
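# A quick usage sketch for the wrapper above; the markup is made up for illustration.
tree = parser_html('<div><p class="msg">hello</p></div>')
print(tree.xpath('//p[@class="msg"]/text()'))  # ['hello']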
import os

import requests
from lxml import html  # the stdlib html module has no fromstring

os.system("color 0a")
print("\n \n")
print("Grand Theft Auto V cheat searcher \n")
print("\n \n")
iNput = input("Search ~>> ")  # search term (read, but never used for filtering)
pullPage = requests.get("http://www.cheatcc.com/ps4/grandtheftauto5cheatscodes.html")
pushPage = html.fromstring(pullPage.content)
cheatName = pushPage.xpath('//b/text()')
cheatCode = pushPage.xpath('//p/text()')
print('CHEAT NAME: ', cheatName)
print('CHEAT CODE: ', cheatCode)
input()
def ProcessJitenon(self):
    page = pickle.load(open(self._get_kanji_file_name(), "rb"))
    tree = html.fromstring(page.content)
    block = tree.xpath('//*[@id="kanjiright"]/table/tr/th')
    startIdx = 0
    endIdx = 0
    print("**** " + self._Kanji + " ****")
    for blk in block:
        if self._Kanji == "点" and len(blk.getchildren()) == 0:
            continue
        blkName, blkRow = h3_row_nb_all(self._Kanji, blk)
        # issue with kanji.jitenon for 平
        if self._Kanji == "平" and blkName == '訓読み':
            blkRow += 1
        if self._Kanji == "平" and blkName == '意味':
            blkRow -= 1
        if self._Kanji == "点" and blkName == '意味':
            blkRow += 1
        if self._Kanji == '袒' and blkName == '訓読み':
            blkRow -= 1
        startIdx = endIdx
        endIdx += blkRow
        print("Block " + blkName + ", nb row: " + str(blkRow) +
              " [" + str(startIdx) + ";" + str(endIdx) + "].")
        subblock = tree.xpath('//*[@id="kanjiright"]/table/tr/td')
        for idx in range(startIdx, endIdx):
            if blkName in ['部首', '画数', '音読み', '訓読み', '漢字検定', '学年']:
                if self._Kanji == '禺' and blkName == '訓読み':
                    content = subblock[idx].text
                elif self._Kanji == '袤' and blkName == '訓読み' and idx == 3:
                    content = subblock[idx].text
                else:
                    content = subblock[idx].getchildren()[0].text
            elif blkName in ['Unicode']:
                content = subblock[idx].text
            elif blkName in ['種別']:
                if len(subblock[idx].getchildren()) > 0:
                    content = subblock[idx].getchildren()[0].text
            elif blkName in ['異体字']:
                content = lxml.html.tostring(subblock[idx], encoding='unicode')
                if '新字体' in content:
                    kind = '新字体'
                elif '標準字体' in content:
                    kind = '標準字体'
                else:
                    kind = None
                content = None
                if kind:
                    link = subblock[idx].getchildren()[0].getchildren()[0].attrib
                    if 'href' in link:
                        content = (kind, link['href'])
            elif blkName in ['意味']:
                content = lxml.html.tostring(subblock[idx], encoding='unicode')
                h = html2text.HTML2Text()
                h.ignore_links = True
                content = h.handle(content)
                # m = re.search("<td>(.*)</td>", content, flags=re.MULTILINE)
                # content = m[1]
            elif blkName in ['JIS水準']:
                if len(subblock[idx].getchildren()) > 0:
                    content = subblock[idx].getchildren()[0].text
                else:
                    content = subblock[idx].text
            self._jitenonItem[blkName].append(content)
            print(self._jitenonItem[blkName])