Beispiel #1
0
Datei: news.py Projekt: pyfun/msf
 def build_newone(self, table):
     """Serialize every element of *table*, wrap the result in a minimal
     HTML page, and display that page in the default web browser.
     """
     opening = """<html><body>"""
     closing = """</body></html>"""
     body = " ".join(tostring(element) for element in table)
     page = fromstring(opening + body + closing)
     open_in_browser(page)
Beispiel #2
0
 def render_in_browser(self, **kwargs):
     """Render the graph and pop the result open in a web browser.

     Raises ImportError when the optional lxml dependency is missing.
     """
     try:
         from lxml.html import open_in_browser
     except ImportError:
         raise ImportError('You must install lxml to use render in browser')
     document = self.render_tree(**kwargs)
     open_in_browser(document, encoding='utf-8')
Beispiel #3
0
 def render_in_browser(self):
     """Render the graph and display it in the default web browser.

     Requires the optional ``lxml`` package.
     """
     try:
         from lxml.html import open_in_browser
     except ImportError:
         raise ImportError('You must install lxml to use render in browser')
     rendered = self.render_tree()
     open_in_browser(rendered, encoding='utf-8')
Beispiel #4
0
    def open_in_browser(response):
        """Thin convenience wrapper around `lxml.html.open_in_browser`.

        Debug-only helper: shows the response's parsed HTML in a browser.

        :param response: object exposing a parsed document as ``.html``
        :return: None
        """
        document = response.html
        lh.open_in_browser(document)
Beispiel #5
0
 def render_in_browser(self, **kwargs):
     """Render the graph and display it in the default web browser.

     ``force_uri_protocol`` defaults to 'https' unless the caller overrides
     it. Requires the optional lxml dependency.
     """
     try:
         from lxml.html import open_in_browser
     except ImportError:
         raise ImportError('You must install lxml to use render in browser')
     kwargs.setdefault('force_uri_protocol', 'https')
     tree = self.render_tree(**kwargs)
     open_in_browser(tree, encoding='utf-8')
Beispiel #6
0
 def render_in_browser(self, **kwargs):
     """Render the graph and show the result in the default web browser.

     Forces HTTPS URIs unless the caller supplies ``force_uri_protocol``.
     Requires the optional lxml dependency.
     """
     try:
         from lxml.html import open_in_browser
     except ImportError:
         raise ImportError("You must install lxml to use render in browser")
     if "force_uri_protocol" not in kwargs:
         kwargs["force_uri_protocol"] = "https"
     rendered = self.render_tree(**kwargs)
     open_in_browser(rendered, encoding="utf-8")
Beispiel #7
0
 def open_in_browser(self):
     """Render this object as an HTML table and show it in the default
     browser. Only supported on Windows; other platforms just print a notice.
     """
     if sys.platform == "win32":
         try:
             td_td_html = self.to_html()
             if not td_td_html:
                 # Nothing to render — bail out silently.
                 return
             table = """<table border="1" cellspacing="0">{}</table>""".format(
                 td_td_html)
             open_in_browser(etree.fromstring(table))
         # Was a bare `except:` — that also swallowed SystemExit and
         # KeyboardInterrupt. Catch Exception only.
         except Exception:
             print("open_in_browser ERROR!")
         else:
             print("open in browser success!")
     else:
         # Message text preserved verbatim ("platform does not support this feature").
         print("{}不支持此功能".format(sys.platform))
Beispiel #8
0
def getSublists(href, hinweis):
    # NOTE(review): Python 2 source (print statements); runs on a scraperwiki runtime.
    """Scrape *href*, open the parsed page in a browser for inspection, then
    follow every link inside ``div.left-nav li.on`` and hand each one to
    scrapeData together with the quoted *hinweis* tag.
    """
    try:
        html = scraperwiki.scrape(href)
        root = lxml.html.fromstring(html)
        # Make relative links absolute against the site root before following them.
        root.make_links_absolute("http://www.spar.at/")
        # Debug aid: shows the fetched page in the default browser.
        open_in_browser(root)
        for sublist in root.cssselect("div.left-nav li.on"):
            try:
                for element, attribute, link, pos in sublist.iterlinks():
                    #url = sublist.cssselect("li a")[0].attrib['href']
                    print "Sublist " + link
                    scrapeData(link, '"' + hinweis + '"')
            except IndexError:
                print "no href"
            except ValueError:
                print "no href"
    except ValueError:
        print "no href"
def getSublists(href, hinweis):
    # NOTE(review): duplicate definition of getSublists (Python 2); if both are
    # loaded, this later definition shadows the earlier one.
    """Scrape *href*, open the parsed page in a browser, and pass every link
    found in ``div.left-nav li.on`` to scrapeData with the quoted *hinweis*.
    """
    try:
        html = scraperwiki.scrape(href)
        root = lxml.html.fromstring(html)
        # Resolve relative links against the site root.
        root.make_links_absolute("http://www.spar.at/")
        # Debug aid: render the fetched page in the default browser.
        open_in_browser(root)
        for sublist in root.cssselect("div.left-nav li.on"):
            try:
                    for element, attribute, link, pos in sublist.iterlinks():
                        #url = sublist.cssselect("li a")[0].attrib['href']
                        print "Sublist "+link
                        scrapeData(link,'"'+hinweis+'"') 
            except IndexError:
                print "no href"
            except ValueError:
                print "no href"
    except ValueError:
        print "no href"
# NOTE(review): Python 2 script (print statements, urllib.urlopen).
import urllib
from lxml import html

# Look up the word 'dictionary' on Youdao and extract the Collins section.
url='http://dict.youdao.com/search?q='+'dictionary'
page = html.fromstring(urllib.urlopen(url).read())
# First matching div nested under the Collins result container.
collins = page.xpath('//*[@id="collinsResult"]/div/div/div/div')[0]
print "collins result: ", html.tostring(collins)
#print "type of collins", type(collins)
#print "type of page", type(page)
# Debug aid: render just the Collins fragment in the default browser.
html.open_in_browser(collins)
Beispiel #11
0
def open_in_browser(tree, encoding = 'utf-8'):
    """Display an lxml document tree in the default web browser."""
    # Alias the lxml helper so it does not shadow this wrapper's name.
    from lxml.html import open_in_browser as _lxml_open
    _lxml_open(tree, encoding)
from lxml import html

# NOTE(review): `events0` is an HTTP response object defined earlier in the
# notebook — not visible in this snippet.
events_html = html.fromstring(events0.text)
# -

# ### Using xpath to extract content from HTML
# `XPath` is a tool for identifying particular elements within a HTML
# document. The developer tools built into modern web browsers make it
# easy to generate `XPath`s that can be used to identify the elements of a
# web page that we wish to extract.
#
# We can open the html document we retrieved and inspect it using
# our web browser.

# + {"results": "'hide'"}
html.open_in_browser(events_html, encoding = 'UTF-8')
# -

# ![](img/dev_tools_right_click.png)
#
# ![](img/dev_tools_inspect.png)
#
# Once we identify the element containing the information of interest we
# can use our web browser to copy the `XPath` that uniquely identifies
# that element.
#
# ![](img/dev_tools_xpath.png)
#
# Next we can use python to extract the element of interest:

events_list_html = events_html.xpath('//*[@id="events_list"]')[0]
Beispiel #13
0
from lxml.html.diff import htmldiff
from lxml.html import parse, tostring, open_in_browser, fromstring


def get_page(url):
    """Fetch *url*, make its links absolute, and return the serialized HTML."""
    root = parse(url).getroot()
    root.make_links_absolute()
    return tostring(root)


def compare_pages(url1, url2, selector='body div'):
    """Return the first page's tree with an inline diff against the second.

    The element matched by *selector* in ``url1`` is replaced in place by an
    htmldiff rendering of that element versus the corresponding element of
    ``url2``.
    """
    base_doc = parse(url1).getroot()
    base_doc.make_links_absolute()
    other_doc = parse(url2).getroot()
    other_doc.make_links_absolute()
    base_el = base_doc.cssselect(selector)[0]
    other_el = other_doc.cssselect(selector)[0]
    diff_markup = htmldiff(tostring(base_el), tostring(other_el))
    diff_node = fromstring(diff_markup)
    parent = base_el.getparent()
    # Swap the original element for the rendered diff.
    parent.insert(parent.index(base_el), diff_node)
    parent.remove(base_el)
    return base_doc


if __name__ == '__main__':
    import sys
    # The CSS selector argument is optional; fall back to compare_pages'
    # default instead of crashing with IndexError when it is omitted.
    if len(sys.argv) > 3:
        doc = compare_pages(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        doc = compare_pages(sys.argv[1], sys.argv[2])
    open_in_browser(doc)
Beispiel #14
0
#!/usr/bin/env python3
# Copyright (c) 2012 Домоглед  <*****@*****.**>
# @author Петр Болф <*****@*****.**>

'''
This is a program which ...
'''

# using the standard library
import webbrowser

# NOTE(review): `url` is not defined in this snippet — set it before running.
webbrowser.open(url)

# or via lxml, which uses the webbrowser module above under the hood

from lxml.html import open_in_browser
# NOTE(review): `element` is not defined in this snippet.
open_in_browser(element)
# here we setup the necessary agent to download a google html page
# NOTE(review): Python 2 script (urllib2, print statements).
opener = urllib2.build_opener()
# Spoof a desktop-browser User-Agent so Google serves the regular HTML page.
opener.addheaders = [('User-agent',
                      'Mozilla/5.0 (Windows NT 6.3; WOW64) \
                      AppleWebKit/537.36 (KHTML, like Gecko) \
                      Chrome/39.0.2171.95 Safari/537.36 \
                      OPR/26.0.1656.60')]

# let's download
google_html = opener.open(URL)

# parse the html
google_parsed = html.parse(google_html)

# Here's a smarter way to see what exactly it is you've downloaded/parsed with lxml:
html.open_in_browser(google_parsed)
#file://c:/users/rodrigo/appdata/local/temp/tmp1xllau.html

# Here comes the 'selecting'!
# The div with id "rso" holds the organic search results.
google_results = google_parsed.xpath('//*[@id="rso"]/div[2]')

print len(google_results)
#1

# the xpath in this line basically selects all children, which in our
# case are the 10 'li' elements
print len(google_results[0].xpath('./*'))
#10

# print out hyperlinks
# Note: after using devtool's magnifying glass and 'copy xpath', I got:
Beispiel #16
0
def view(response=None):
    """Open *response* (or the most recently stored one) in a web browser."""
    global data
    if response is None:
        # Fall back to the module-level cache when no response is supplied.
        response = data["response"]
    open_in_browser(HtmlParser(response), response.encoding)
Beispiel #17
0
 def render_in_browser(self):
     """Render the graph and display it in the default web browser."""
     from lxml.html import open_in_browser
     tree = self.render_tree()
     open_in_browser(tree, encoding='utf-8')
#
# While JSON parsing is built into the Python `requests` library, parsing HTML requires a separate library. I recommend using the HTML parser from the `lxml` library; others prefer an alternative called `beautifulsoup4`.

from lxml import html

# convert a html text representation (`events.text`) into 
# a tree-structure (DOM) html representation (`events_html`)
# NOTE(review): `events` is an HTTP response defined earlier in the notebook.
events_html = html.fromstring(events.text)

# ### Using XPath to extract content from HTML
#
# `XPath` is a tool for identifying particular elements within a HTML document. The developer tools built into modern web browsers make it easy to generate `XPath`s that can be used to identify the elements of a web page that we wish to extract.
#
# We can open the HTML document we retrieved and inspect it using our web browser.

html.open_in_browser(events_html, encoding = 'UTF-8')

# ![](Python/PythonWebScrape/images/dev_tools_right_click.png)
#
# ![](Python/PythonWebScrape/images/dev_tools_inspect.png)
#
# Once we identify the element containing the information of interest we can use our web browser to copy the `XPath` that uniquely identifies that element.
#
# ![](Python/PythonWebScrape/images/dev_tools_xpath.png)
#
# Next we can use Python to extract the element of interest:

events_list_html = events_html.xpath('//*[@id="events_list"]/article')

# Let's just extract the second element in our events list.
Beispiel #19
0
from lxml import html

# NOTE(review): `events0` is an HTTP response object defined earlier in the
# notebook — not visible in this snippet.
events_html = html.fromstring(events0.text)
# -

# ### Using xpath to extract content from HTML
# `XPath` is a tool for identifying particular elements within a HTML
# document. The developer tools built into modern web browsers make it
# easy to generate `XPath`s that can be used to identify the elements of a
# web page that we wish to extract.
#
# We can open the html document we retrieved and inspect it using
# our web browser.

# + {"results": "'hide'"}
html.open_in_browser(events_html, encoding = 'UTF-8')
# -

# ![](img/dev_tools_right_click.png)
#
# ![](img/dev_tools_inspect.png)
#
# Once we identify the element containing the information of interest we
# can use our web browser to copy the `XPath` that uniquely identifies
# that element.
#
# ![](img/dev_tools_xpath.png)
#
# Next we can use python to extract the element of interest:

events_list_html = events_html.xpath('//*[@id="events_list"]')[0]
Beispiel #20
0
def compare_in_differnt_browser(comparedpage):
  """Parse the URL currently in the address bar and open it in a browser.

  Note: *comparedpage* is accepted for interface compatibility but unused.
  (Name spelling kept as-is — it is the public interface.)
  """
  from lxml.html import parse, open_in_browser
  current = addressbar.get_text()
  doc = parse(current).getroot()
  doc.make_links_absolute()
  open_in_browser(doc)
Beispiel #21
0
 def open_in_browser(self, encoding='utf-8'):
     """Display this document's root element in the default web browser."""
     root = self.root
     html.open_in_browser(root, encoding)
Beispiel #22
0
 def open_in_browser(self):
     """Show the stored element tree in the default web browser."""
     tree = self.tree
     open_in_browser(tree)
Beispiel #23
0
def open_in_browser(tree, encoding='utf-8'):
    """Open an lxml document tree in the default web browser."""
    # Module-style import avoids shadowing this wrapper's own name.
    import lxml.html
    lxml.html.open_in_browser(tree, encoding)
Beispiel #24
0
 def show(self):
     """Open this object's source document in the default web browser."""
     doc = self.source
     open_in_browser(doc)
Beispiel #25
0
def view(response=None):
    """Display *response* — or the globally cached one — in a web browser."""
    if response is None:
        global data
        response = data["response"]
    parsed = HtmlParser(response)
    open_in_browser(parsed, response.encoding)
#!/usr/bin/env python3
# Copyright (c) 2012 Домоглед  <*****@*****.**>
# @author Петр Болф <*****@*****.**>

"""
This is a program which ...
"""

# using the standard library
import webbrowser

# NOTE(review): `url` is not defined in this snippet — set it before running.
webbrowser.open(url)

# or via lxml, which uses the webbrowser module above under the hood

from lxml.html import open_in_browser

# NOTE(review): `element` is not defined in this snippet.
open_in_browser(element)
Beispiel #27
0
 def render_in_browser(self):
     """Render the graph, then hand the tree to lxml to show in a browser."""
     from lxml.html import open_in_browser
     rendered = self.render_tree()
     open_in_browser(rendered, encoding='utf-8')
def get_captcha(captcha_src):
    """Show the captcha image in a browser and return what the user types in."""
    markup = f"<img src='{captcha_src}' />"
    html.open_in_browser(html.fromstring(markup))
    return input("Enter the captcha displayed in browser: ")