Example No. 1
def Download_Geckodriver():  #Windows

    #Download and extract geckodriver
    try:
        response = requests.get(
            "https://github.com/mozilla/geckodriver/releases/")
        tree = html.fromstring(response.content)
        tree = tree.xpath(
            '/html/body/div[4]/div/main/div[2]/div/div[3]/div[1]/div/div[2]/div[1]/div/div/a'
        )

        if platform.architecture()[0] == '64bit':
            Zipname = "geckodriver-" + tree[0].text_content() + "-win64.zip"
        else:
            Zipname = "geckodriver-" + tree[0].text_content() + "-win32.zip"

        response = requests.get(
            'https://github.com/mozilla/geckodriver/releases/download/' +
            tree[0].text_content() + "/" + Zipname,
            stream=True)
        if response.status_code == 200:
            with open(Zipname, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
            del response
    except Exception:
        print(
            "Could not download and write the newest version of geckodriver ...")
        return

    try:
        if os.path.isfile("geckodriver.exe"):
            os.remove("geckodriver.exe")
        with zipfile.ZipFile(Zipname) as archive:
            archive.extractall()
    except Exception:
        print("Could not extract " + Zipname + " ...")
Example No. 2
def get_image_amount(url):
    # This is reinventing the wheel: the core logic is identical to the previous
    # function. A simpler design would merge the three functions (get title, get
    # links, get image count) into one and return the results as a tuple, but for
    # a beginner tutorial, keeping them separate is easier to follow. Readers who
    # want a challenge can try the tuple version (see the sketch after this function).
    response = requests.get(url).content
    selector = html.fromstring(response)
    image_amount = selector.xpath("//div[@class='page']/a[last()-1]/text()")[0]
    # The second-to-last <a> block is the gallery's last page, which is also the
    # total number of images, so we can read the value directly.
    return image_amount
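The comment above suggests folding the title, link and image-count logic into one function that returns a tuple. A minimal sketch of that idea, reusing the XPath expressions from Examples 2 and 3 (the name get_image_info is made up for illustration):

def get_image_info(url):
    # One request instead of several; returns (title, total image count).
    selector = html.fromstring(requests.get(url).content)
    title = selector.xpath("//h2/text()")[0]
    amount = selector.xpath("//div[@class='page']/a[last()-1]/text()")[0]
    return title, int(amount)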
Example No. 3
def get_image_title(url):
    # We are now on the gallery's detail page; here we pull out the gallery title
    # (the image count is handled by the companion function above).
    response = requests.get(url).content
    selector = html.fromstring(response)
    image_title = selector.xpath("//h2/text()")[0]
    # Note that xpath() always returns a list, so [0] is needed to pick the match.
    return image_title
Example No. 4
def get_image_detail_website(url):
    # Reinventing the wheel again here.
    response = requests.get(url).content
    selector = html.fromstring(response)
    image_detail_websites = []
    image_amount = selector.xpath("//div[@class='page']/a[last()-1]/text()")[0]
    # The variable is rebuilt here just to get the image count again. A cleaner
    # approach is to pass the value between functions (see the sketch after this
    # function); this is the simple-but-clumsy version.
    # Container for the individual image URLs
    for i in range(int(image_amount)):
        image_detail_link = '{}/{}'.format(url, i + 1)
        response = requests.get(image_detail_link).content
        sel = html.fromstring(response)
        image_download_link = sel.xpath(
            "//div[@class='content']/a/img/@src")[0]
        # This is the final download URL of a single image
        image_detail_websites.append(image_download_link)
    return image_detail_websites
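As the comment notes, the image count is scraped a second time here. A minimal sketch of passing it in from get_image_amount instead (the extra image_amount parameter is an assumption, not part of the original tutorial):

def get_image_detail_website(url, image_amount):
    # Reuse the count already returned by get_image_amount(url)
    # instead of parsing the gallery page again.
    image_detail_websites = []
    for i in range(int(image_amount)):
        image_detail_link = '{}/{}'.format(url, i + 1)
        sel = html.fromstring(requests.get(image_detail_link).content)
        image_detail_websites.append(
            sel.xpath("//div[@class='content']/a/img/@src")[0])
    return image_detail_websites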
Example No. 5
def init_ijdoukun():
    all_contents = b''
    page_addr = 'https://joyokanji.info/iji.html?'
    for page_idx in [
            "a", "ka", "sa", "ta", "na", "ha", "ma", "ya", "ra", "wa"
    ]:
        page = requests.get(page_addr + page_idx)
        all_contents = all_contents + page.content
    return html.fromstring(all_contents)
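Concatenating whole HTML documents and parsing the result leans on lxml's error recovery, since html.fromstring expects a single document. A safer variant under the same URL scheme is to parse each page on its own and return the list of trees (a sketch, not the original function):

def init_ijdoukun_trees():
    # One parsed tree per kana index page instead of one tree
    # built from the concatenated page bytes.
    page_addr = 'https://joyokanji.info/iji.html?'
    trees = []
    for page_idx in ["a", "ka", "sa", "ta", "na", "ha", "ma", "ya", "ra", "wa"]:
        page = requests.get(page_addr + page_idx)
        trees.append(html.fromstring(page.content))
    return trees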
Example No. 6
 def tmp_enrich_with_reading_table(self, read_tbl):
     tree = html.fromstring(read_tbl)
     read_list = tree.xpath('//*[@class="C_read"]')
     #read_list = tree.cssselect('//[@class="C_read"]')
     for read_item in read_list:
         content = lxml.html.tostring(read_item, encoding='unicode')
         targ = '<td class="C_read">' + self.formated + '</td>'
         if content == targ:
             print(self.formated + self.reading)
Example No. 7
    def download_html(self, url: str):
        content = self.load_url_content(url)
        if content is None:
            page = requests.get(url, allow_redirects=True)
            content = page.content if page.status_code == 200 else b""
            self.save_url_content(url, content)

        if content is None:
            return None

        tree = html.fromstring(content) if content else None
        return tree
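load_url_content and save_url_content are not shown in this snippet. A minimal file-backed sketch of what they might look like, written as plain functions here (in the snippet they are methods of the same class; the cache directory and hashed filenames are assumptions):

import hashlib
import os

CACHE_DIR = "cache"  # assumed location, not part of the original class

def _cache_path(url: str) -> str:
    return os.path.join(CACHE_DIR, hashlib.sha1(url.encode("utf-8")).hexdigest())

def load_url_content(url: str):
    # Return the cached bytes for this URL, or None if it was never saved.
    path = _cache_path(url)
    return open(path, "rb").read() if os.path.isfile(path) else None

def save_url_content(url: str, content: bytes):
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(_cache_path(url), "wb") as cache_file:
        cache_file.write(content)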
Example No. 8
def get_page_number(num):
    # Build a function that finds the detail address of every gallery on a listing
    # page. Each page currently holds 15 galleries, so it should return a list of
    # 15 links.
    url = 'http://www.mmjpg.com/home/' + num
    # Build the URL of each listing page
    response = requests.get(url).content
    # Use requests to fetch the raw (binary) response body. Note that using .text
    # here makes the html parsing below fail (try it). In short: .text is fine for
    # textual content such as prose and links; use .content for binary content
    # such as images, audio and video.
    selector = html.fromstring(response)
    # Build a selector with lxml.html: it turns the binary server response into a
    # parseable element tree. lxml's etree module builds element trees in general;
    # lxml.html.fromstring is the variant for HTML strings, as the names suggest.
    urls = []
    # Prepare the container
    for i in selector.xpath("//ul/li/a/@href"):
        # Use xpath to locate every gallery's detail address
        urls.append(i)
        # Append each address to the container
    return urls
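Taken together with Examples 2, 3 and 4, get_page_number completes the pipeline. A short, purely illustrative driver (the page range is arbitrary):

for page in range(1, 3):  # first two listing pages, for illustration only
    for gallery_url in get_page_number(str(page)):
        title = get_image_title(gallery_url)
        amount = get_image_amount(gallery_url)
        links = get_image_detail_website(gallery_url)
        print(title, amount, len(links))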
Example No. 9
    def ProcessJitenon(self):
        page = pickle.load(open(self._get_kanji_file_name(), "rb"))
        tree = html.fromstring(page.content)

        block = tree.xpath('//*[@id="kanjiright"]/table/tr/th')
        startIdx = h3_row_nb(block[0]) + h3_row_nb(block[1])
        endIdx = startIdx + h3_row_nb(block[2]) + h3_row_nb(block[3])

        # issue with kanji.jitenon for 平
        if self._Kanji == "平":
            endIdx += 1

        block = tree.xpath('//*[@id="kanjiright"]/table/tr/td')
        for idx in range(startIdx, endIdx):
            read = block[idx].getchildren()[0].text
            self._readingList.append(CYomi(read, "△" in block[idx].text))
Example No. 10
 def get_meta(self):
     plain_html = self.response_text
     h = html2text.HTML2Text()
     h.ignore_links = True
     plain_text = h.handle(plain_html)
     text_obj = html.fromstring(plain_html.lower())
     try:
         title = text_obj.xpath('//title/text()')[0].strip()
     except Exception:
         title = 'None'
     try:
         description = text_obj.xpath(
             '//meta[@name="description"]/@content')[0].strip()
     except Exception:
         description = 'None'
     return [title, description, plain_text, plain_html]
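get_meta returns a four-element list; a hypothetical call site, assuming self.response_text already holds the fetched page (the instance name scraper is made up):

title, description, plain_text, plain_html = scraper.get_meta()
print(title)
print(description)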
Example No. 11
    def CargarUrl(self, url):

        from lxml import etree, html
        from urllib import request

        url_partes = url.split('://', 2)
        url = '%s://%s' % (url_partes[0], url_partes[-1].replace('//', '/'))

        req = request.Request(url, headers={'User-Agent': "Magic Browser"})
        page_content = request.urlopen(req).read()
        # Strip CDATA wrappers so the HTML parser sees the markup inside them
        page_content = re.sub(rb'<!\[CDATA\[(.+)\]\]>', rb'\1', page_content)
        page_elem = html.fromstring(page_content)

        #parser = etree.XMLParser(strip_cdata=False)
        #page_elem = etree.XML(page_content, parser)

        return page_elem
Example No. 12
def Check_for_MangaUpdate(Url, Old_Release_Text):
    try:
        response = requests.get(Url, stream=True)
        if response.status_code == 200:
            tree = html.fromstring(response.content)
            Title = tree.xpath(
                '/html/body/div[2]/div[2]/div[2]/div[2]/div/div[2]/div[1]/div[1]/span[1]'
            )[0].text
            Release = tree.xpath(
                '//*[@id="main_content"]/div[2]/div[1]/div[3]/div[12]')
            Release_Text = Release[0].text_content().split(' by')[0]
        else:
            return 0, response.status_code, ""

        if Release_Text == Old_Release_Text:
            return 0, "", ""
        else:
            return 1, Release_Text, Title
    except Exception as e:
        print(str(e) + " Could not check " + Url + " for updates ...")
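The function returns a three-value status tuple: (0, "", "") when nothing changed, (1, Release_Text, Title) on an update, and (0, status_code, "") on an HTTP error. A hypothetical call site (series_url and last_seen_release are placeholder names):

changed, release, title = Check_for_MangaUpdate(series_url, last_seen_release)
if changed:
    print("New release for " + title + ": " + release)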
Example No. 13
def Download_Ublockorigin():

    #Download Ublockorigin
    try:
        response = requests.get("https://github.com/gorhill/uBlock/releases/")
        tree = html.fromstring(response.content)
        tree = tree.xpath(
            '/html/body/div[4]/div/main/div[2]/div/div[3]/div[1]/div/div[2]/div[1]/div/div/a'
        )

        xpiname = 'uBlock0_' + tree[0].text_content() + '.firefox.signed.xpi'

        response = requests.get(
            'https://github.com/gorhill/uBlock/releases/download/' +
            tree[0].text_content() + "/" + xpiname,
            stream=True)
        if response.status_code == 200:
            with open(xpiname, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
    except Exception:
        print(
            "Could not download and write the newest version of uBlock Origin ...")
Example No. 14
async def miiverse(ctx, *url):
	if not url:
		return await main.say("```\nmiiverse [post URL, e.g. AYEBAAAEAAB2UZ8mAzTspw]```")
	if url[0] == 'AYEBAAAEAAB2UZ8mAzTspw':
		return await main.say("haha that isn't actually a post")
	await main.send_typing(ctx.message.channel)
	try:
		srv = urllib.request.urlopen("https://miiverse.nintendo.net/posts/{0}/embed".format(url[0]))
	except Exception as e:
		return await main.say("```\n" + str(e) + "```")
	ftree = html.fromstring(srv.read().decode())
	drawing = ftree.xpath('//*[@id="post-content"]/div/p/img/@src')
	post = ftree.xpath('//*[@id="post-content"]/div/p/text()')
	screenshot = ftree.xpath('//*[@id="post-content"]/div/div[1]/img/@src')
	thing = ""
	if drawing:
		thing += "c'est une belle peinture\n\n" + drawing[0]
	else:
		thing += post[0]
	if screenshot:
		thing += "\n" + screenshot[0]
	return await main.say(thing)
Example No. 15
 PERSON_SAMPLE_SIZE = 1000
 # initialize array
 full = []
 # get website from config
 for url in my_config.config_values['name_generator_url']:
     logger.info('Looking at : %s', url)
     # repeat for sample size
     for i in range(0, PERSON_SAMPLE_SIZE):
         content = {}
         # sleep random time
         time.sleep(random.randint(10, 20))
         # get the page
         page_content, page_found = get_page(url)
         # page_content=requests.get(url)
         logger.info('page_content type : %s', type(page_content))
         tree = html.fromstring(page_content)
         #
         address = tree.xpath(
             '/html/body/div[3]/div[1]/div[7]/div[12]//text()')
         website = tree.xpath(
             '/html/body/div[3]/div[1]/div[7]/div[18]//text()')
         card_number = tree.xpath(
             '/html/body/div[3]/div[1]/div[7]/div[24]//text()')
         card_expire = tree.xpath(
             '/html/body/div[3]/div[1]/div[7]/div[26]//text()')
         security_code = tree.xpath(
             '/html/body/div[3]/div[1]/div[7]/div[28]//text()')
         occupation = tree.xpath(
             '/html/body/div[3]/div[1]/div[7]/div[30]//text()')
         company = tree.xpath(
             '/html/body/div[3]/div[1]/div[7]/div[32]//text()')
Example No. 16
from lxml import html
import requests

response = requests.get('http://packtpub.com/tech/python')
tree = html.fromstring(response.content)

books = tree.xpath('//div[@class="price-wrapper"]/text()')
print(books)
Example No. 17
def rscrape():
    order = [
        'name', 'type', 'reviews', 'phone', 'address', 'url', 'description',
        'profile'
    ]
    files = listdir('**removed**')
    for f in files[:1]:
        s = requests.Session()
        s.mount('file://', FileAdapter())

        resp = s.get('file:///C:/Users/**removed**/Documents/**removed**/' + f)
        ##                r = requests.get(SITE +f,
        ##                                 headers={'User-Agent':'Mozilla/5.0'})
        tree = html.fromstring(resp.content)

        rev = [
            r[:r.find('\n')] for r in tree.xpath(
                '//div[@class="l-small-top-space l-small-bottom-space verified-reviews"]/a[@href]/text()'
            )
        ]
        # indexes of reviews >= 20
        indexes = []
        for r in range(len(rev)):
            if int(rev[r]) > 19:
                indexes.append(r)

        # dict of contractors
        contractor = {}

        contractor['name'] = [
            tree.xpath('//span[@itemprop="name"]/text()')[i] for i in indexes
        ]

        c_type = tree.xpath('//h1[@class="t-header-secondary"]/text()')[0]
        contractor['type'] = c_type[:c_type.find(
            'Com') - 1] if c_type[:2] == 'Wi' else c_type[:c_type.find('Cont'
                                                                       ) - 1]

        contractor['reviews'] = [rev[i] for i in indexes]

        contractor['phone'] = [
            tree.xpath('//span[@itemprop="telephone"]/text()')[i]
            for i in indexes
        ]

        contractor['address'] = [
            tree.xpath('//span[@itemprop="streetAddress"]/text()')[i] + ', ' +
            tree.xpath('//span[@itemprop="addressLocality"]/text()')[i] +
            ', ' + tree.xpath('//span[@itemprop="addressRegion"]/text()')[i] +
            ' ' + tree.xpath('//span[@itemprop="postalCode"]/text()')[i]
            for i in indexes
        ]

        # Initialise the per-link lists once, before the loop; initialising them
        # inside the loop (as the original did) resets them on every iteration,
        # so only the last link's data would survive.
        contractor['url'] = []
        contractor['description'] = []
        contractor['profile'] = []

        for link in [
                tree.xpath(
                    '//div[@class="l-small-top-space l-small-bottom-space verified-reviews"]/a/@href'
                )[i] for i in indexes
        ]:
            l2 = SITE + link[:link.find('#')]
            print(l2)

            req = requests.get(l2, headers={'User-Agent': 'Mozilla/5.0'})
            tree2 = html.fromstring(req.text)
            print(tree2)

            try:
                contractor['url'].append(
                    tree2.xpath('//a[@rel="nofollow"]/text()'))
            except Exception:
                contractor['url'].append('')

            # Add try block if this ever doesn't exist
            desc = tree2.xpath('//p[@class="t-heavy"]/text()')
            print(desc)
            contractor['description'].append(desc[0])
            contractor['profile'].append(desc[1])

        print(contractor)
Example No. 18
__author__ = 'dixon'
from lxml import html
import requests

page = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
tree = html.fromstring(page.text)

print(tree)




Example No. 20
 def parser_html(html_str):
     return html.fromstring(html_str)
Example No. 21
from lxml import html
import requests
import os

os.system("color 0a")

print("\n \n")
print("Grand Theft Auto V cheat searcher \n")
print("\n \n")

iNput = input("Search ~>> ")

pullPage = requests.get("http://www.cheatcc.com/ps4/grandtheftauto5cheatscodes.html")

pushPage = html.fromstring(pullPage.content)

cheatName = pushPage.xpath('//b/text()')
cheatCode = pushPage.xpath('//p/text()')

print('CHEAT NAME: ', cheatName)
print('CHEAT CODE: ', cheatCode)

input()
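As written, the search string is read but never used. A small sketch of filtering the scraped cheat names on it (assuming the cheat names really are the <b> texts selected above):

matches = [name for name in cheatName if iNput.lower() in name.lower()]
print('MATCHES: ', matches)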
Example No. 22
    def ProcessJitenon(self):
        page = pickle.load(open(self._get_kanji_file_name(), "rb"))
        tree = html.fromstring(page.content)

        block = tree.xpath('//*[@id="kanjiright"]/table/tr/th')
        startIdx = 0
        endIdx = 0
        print("**** " + self._Kanji + " ****")
        for blk in block:
            if self._Kanji == "点" and len(blk.getchildren()) == 0: continue

            blkName, blkRow = h3_row_nb_all(self._Kanji, blk)

            # issue with kanji.jitenon for 平
            if self._Kanji == "平" and blkName == '訓読み':
                blkRow += 1
            if self._Kanji == "平" and blkName == '意味':
                blkRow -= 1
            if self._Kanji == "点" and blkName == '意味':
                blkRow += 1
            if self._Kanji == '袒' and blkName == '訓読み':
                blkRow -= 1

            startIdx = endIdx
            endIdx += blkRow
            print("Block " + blkName + ", nb row: " + str(blkRow) + " [" +
                  str(startIdx) + ";" + str(endIdx) + "].")
            subblock = tree.xpath('//*[@id="kanjiright"]/table/tr/td')
            for idx in range(startIdx, endIdx):
                if blkName in ['部首', '画数', '音読み', '訓読み', '漢字検定', '学年']:
                    if self._Kanji == '禺' and blkName == '訓読み':
                        content = subblock[idx].text
                    elif self._Kanji == '袤' and blkName == '訓読み' and idx == 3:
                        content = subblock[idx].text
                    else:
                        content = subblock[idx].getchildren()[0].text
                elif blkName in ['Unicode']:
                    content = subblock[idx].text
                elif blkName in ['種別']:
                    if len(subblock[idx].getchildren()) > 0:
                        content = subblock[idx].getchildren()[0].text
                elif blkName in ['異体字']:
                    content = lxml.html.tostring(subblock[idx],
                                                 encoding='unicode')

                    if '新字体' in content:
                        kind = '新字体'
                    elif '標準字体' in content:
                        kind = '標準字体'
                    else:
                        kind = None

                    content = None
                    if kind:
                        link = subblock[idx].getchildren()[0].getchildren(
                        )[0].attrib
                        if 'href' in link:
                            content = (kind, link['href'])

                elif blkName in ['意味']:
                    content = lxml.html.tostring(subblock[idx],
                                                 encoding='unicode')
                    h = html2text.HTML2Text()
                    h.ignore_links = True
                    content = h.handle(content)
                    # m = re.search("<td>(.*)</td>", content, flags=re.MULTILINE)
                    # content = m[1]
                elif blkName in ['JIS水準']:
                    if len(subblock[idx].getchildren()) > 0:
                        content = subblock[idx].getchildren()[0].text
                    else:
                        content = subblock[idx].text
                self._jitenonItem[blkName].append(content)

            print(self._jitenonItem[blkName])