def news():
    global url
    global ns
    global headers

    opener = urllib2.build_opener()
    opener.addheaders = headers

    pagetext = opener.open(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(pagetext)
    main = page.find("//%sdiv[@class='centre-wide-main-content-column']"%ns)
    for entry in main.findall("%sdiv"%ns):
        title = entry.find("%sdiv[@class='news-item news-title']"%ns).text.strip()

        number = int(filter(lambda c: c in string.digits, (entry.attrib.get("onclick","0"))))
        url = "http://www.guernseyfc.com/news.details.php?id=%d&random=%s"%(number,ourhash(number))

        head_tag = entry.find("%sdiv[@class='news-item news-brief-descript']/%stable/%stbody/%str/%std/%sh1"%(ns,ns,ns,ns,ns,ns))
        if head_tag is None:
            head = ""
        else:
            head = head_tag.text
        
        scraperwiki.sqlite.save(unique_keys=["number"],data={"title":title, "number":number, "url":url, "head":head})
Example #2
def test_parser_encoding(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def wiki_string_to_tiddlers(content):
    """
    Turn a string that is a TiddlyWiki into individual tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)
    # minidom will not provide working getElementById without
    # first having a valid document, which means some very specific
    # doctype hooey. So we traverse the tree by hand instead.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break

    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
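
A minimal usage sketch, assuming a TiddlyWiki 2.x HTML export and the project's own _get_tiddler_from_div helper (the file name here is illustrative):

import codecs

# read a hypothetical TiddlyWiki export and split it into tiddlers
with codecs.open('mywiki.html', encoding='utf-8') as f:
    tiddlers = wiki_string_to_tiddlers(f.read())
for tiddler in tiddlers:
    print tiddler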
Example #4
File: o2.py Project: d-fens/smspie
	def login(self, username, password):
		"""
		Login to o2online.ie

		Returns true if successful or false if fails.
		"""
		if self.resumable():
			self.logger.info("Resuming from login.")
			return True
		else:
			self.logger.info("Unable to resume, running connect from login.")
			self.connect()

		post = [
			('IDButton', 'Go'),
			('org', 'o2ext'),
			('CONNECTFORMGET', 'TRUE'),
			('IDToken1', username),
			('IDToken2', password)
		]

		handle = self.post('https://www.o2online.ie/amserver/UI/Login', post)
		from html5lib import HTMLParser, treebuilders
		parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
		soup = parser.parse(handle)

		if unicode(soup.html.head.title.string).strip() == u"LoginCheck":
			self.logger.info("login has correct HTML title.")
			return True
		return False
    def extract_html_urls(self, html):
        """
        Take all ``<img src="..">`` from the HTML
        """
        p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom = p.parse(html)
        urls = []

        for img in dom.getElementsByTagName('img'):
            src = img.getAttribute('src')
            if src:
                urls.append(unquote_utf8(src))

            srcset = img.getAttribute('srcset')
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName('source'):
            srcset = source.getAttribute('srcset')
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName('a'):
            href = source.getAttribute('href')
            if href:
                urls.append(unquote_utf8(href))

        return urls
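
The extract_srcset helper used above is not part of this snippet; a hedged sketch of what such a method might look like (an assumption, not the project's actual implementation):

    def extract_srcset(self, srcset):
        """Return the URL part of every candidate in a srcset attribute."""
        urls = []
        for candidate in srcset.split(','):
            candidate = candidate.strip()
            if candidate:
                # each candidate is "<url> <descriptor>"; keep only the URL
                urls.append(unquote_utf8(candidate.split()[0]))
        return urls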
Example #6
def shallow_scrape():
    urns = set([])

    br = mechanize.Browser()
    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml")

    moreWorkToDo = True
    c = 1

    while moreWorkToDo and (c<3):
        print "Handling page %d..."%c
    
        ### extract data from page
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(resultspage)

        for u in page.getroot().findall(path(["body","div","div","div","div","table","tbody","tr","td","table","tbody","tr","td","a"],"")):
            #href = u.attrib.get("href","")
            href = u.get("href")
            print "href: %s"%href
            urn = re.search("urn=([0-9]{6})",href).group(1)
            urns.add(urn)
            print "%s, "%urn
        print

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            c += 1
        except mechanize.LinkNotFoundError:
            moreWorkToDo = False

    return urns
Example #7
def scraper(request):
    post_data = {
            'classyear' : '2008', # why??
            'subj': 'COSC',
            'crsenum': '50'
        }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

    
    # scrape the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers =  {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()

    # parse for the dept and course number
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    #tbody = soup.find('th', text='Term').parent.parent.parent
    #soup = tbody.findAll('tr')[2]('td')
    

    return render_to_response("scraper.html", {'soup': soup})
Example #8
def get_spaces_available(dept_abbr, course_num):
	# define
	post_data = {
		'classyear' : '2008', #don't know WHY!?!
		'subj': dept_abbr,
		'crsenum': course_num,
	}
	url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

	# get the html
	cj = cookielib.LWPCookieJar()
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	urllib2.install_opener(opener)
	headers =  {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
	request = urllib2.Request(url, urllib.urlencode(post_data), headers)
	handle = urllib2.urlopen(request)
	html = handle.read()

	# parse the html
	parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
	soup = parser.parse(html)
	tbody = soup.find('th', text='Term').parent.parent.parent
	cells = tbody.findAll('tr')[2]('td')
	enrolled = int(cells[-2].contents[0])
	capacity = int(cells[-3].contents[0])

	print "%i spaces left (capacity of %i with %i enrolled)" % (capacity-enrolled, capacity, enrolled)
Example #9
File: html.py Project: riffm/iktomi
 def get_dom(self, buf):
     buf = buf.strip()
     if not buf:
         return None
     p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                             tokenizer=self.token_sanitizer())
     return p.parseFragment(buf)
Example #10
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
Example #11
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
Example #12
 def __init__(self, api='etree'):
     # if no default implementation is defined for this api, set it to None
     # to let getTreeBuilder() use its default implementation.
     implementation = self.defaults.get(api, None)
     HTMLParser.__init__(self,
                         tree=treebuilders.getTreeBuilder(
                             api, implementation))
Example #13
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
Example #14
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(
        data, encoding, p.tokenizer.stream.charEncoding[0])
Example #15
def wiki_string_to_tiddlers(content):
    """
    Turn a string that is a TiddlyWiki into individual tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)
    # minidom will not provide working getElementById without
    # first having a valid document, which means some very specific
    # doctype hooey. So we traverse the tree by hand instead.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break

    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
Example #16
def summary_scrape(urn):
    print " - summary"
    url = "http://www.edubase.gov.uk/establishment/summary.xhtml?urn=" + urn
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))

    keyvaluepairs = table_extract(page)

    raw_address = [x.strip() for x in keyvaluepairs.pop("").split(" / ")]
    if postcode.match(raw_address[-1]):
        keyvaluepairs["Postcode"] = raw_address[-1]
        raw_address = raw_address[:-1]
    keyvaluepairs["Address"] = " / ".join(raw_address)

    for t in page.findall(
            path([
                "body", "div", "div", "div", "div", "table", "tbody", "tr",
                "td", "h1"
            ], pre)):
        x = t.text.split(": ")
        keyvaluepairs[x[0]] = x[1]

    for t in page.findall(
            path([
                "body", "div", "div", "div", "div", "table", "tbody", "tr",
                "td", "div", "p", "b"
            ], pre)):
        keyvaluepairs[t.text.strip().strip(":")] = (t.tail or "").strip()

    return keyvaluepairs
Example #17
File: util.py Project: mzmttks/jsm
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                        tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)

    for section in page.findall(
            "body/div/div/div/div/div/div/div/div/table[@class='fixture']"):

        matchtype = section.find("caption").text

        for match in section.findall("tbody/tr"):

            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"],
                                       d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
Example #19
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode('ascii')

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
Example #20
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
Example #21
def test_debug_log():
    parser = HTMLParser(debug=True)
    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

    expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
                ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]

    if PY2:
        for i, log in enumerate(expected):
            log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
            expected[i] = tuple(log)

    assert parser.log == expected
Example #22
    def extract_html_urls(self, html):
        """
        Take all ``<img src="..">`` from the HTML
        """
        p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom = p.parse(html)
        urls = []

        for img in dom.getElementsByTagName("img"):
            src = img.getAttribute("src")
            if src:
                urls.append(unquote_utf8(src))

            srcset = img.getAttribute("srcset")
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName("source"):
            srcset = source.getAttribute("srcset")
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName("a"):
            href = source.getAttribute("href")
            if href:
                urls.append(unquote_utf8(href))

        return urls
Example #23
def test_parser_args(expected, data, kwargs):
    stream = _inputstream.HTMLBinaryInputStream(data,
                                                useChardet=False,
                                                **kwargs)
    assert expected == stream.charEncoding[0].name
    p = HTMLParser()
    p.parse(data, useChardet=False, **kwargs)
    assert expected == p.documentEncoding
Example #24
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    block = soup.find('', {'id' : 'photoresult'}) # isolate the table of data on the first result
    block = block.findAll('', {'class' : 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
Example #25
def parse(f):
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = p.parse(f)
    walker = treewalkers.getTreeWalker("dom")

    tokens = []
    bintokens = []

    waitfor = None

    for tok in walker(doc):

        if waitfor:
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue

        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])

        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)

        elif tok["type"] in ("Characters",):
            for tok1 in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": tok1})

        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            pass

        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    cumbintokens = [bintokens[0]]

    for tok in bintokens[1:]:
        cumbintokens.append(cumbintokens[-1] + tok)

    length = len(cumbintokens)

    midx = None
    m = None

    for i in range(length):
        for j in range(i + 1, length):
            end_tag = cumbintokens[-1] - cumbintokens[j]
            start_tag = cumbintokens[i]
            text_between = (j - i) - (cumbintokens[j] - cumbintokens[i])
            nm = end_tag + start_tag + text_between

            if not midx or nm > m:
                midx = i, j
                m = nm

    i, j = midx
    return serialize_tokens(tokens[i:j + 1])
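
The double loop above scores every window [i, j] of the token stream as tags before the window plus tags after it plus plain words inside it, so the selected window is the text-densest region of the page (similar in spirit to classic body-text-extraction heuristics). A toy illustration of that score using the same cumulative-sum bookkeeping:

# Illustrative only: 1 marks a tag token, 0 marks a word token.
bintokens = [1, 1, 0, 0, 0, 1, 0, 1, 1]
cum = [bintokens[0]]
for tok in bintokens[1:]:
    cum.append(cum[-1] + tok)
i, j = 1, 7  # a hypothetical window
tags_outside = cum[i] + (cum[-1] - cum[j])   # 2 + 1
words_inside = (j - i) - (cum[j] - cum[i])   # 4
score = tags_outside + words_inside          # 7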
Example #26
def test_productionlist(app, status, warning):
    app.builder.build_all()

    warnings = warning.getvalue().split("\n")
    assert len(warnings) == 2
    assert warnings[-1] == ''
    assert "Dup2.rst:4: WARNING: duplicate token description of Dup, other instance in Dup1" in warnings[
        0]

    with (app.outdir / 'index.html').open('rb') as f:
        etree = HTMLParser(namespaceHTMLElements=False).parse(f)
    ul = list(etree.iter('ul'))[1]
    cases = []
    for li in list(ul):
        assert len(list(li)) == 1
        p = list(li)[0]
        assert p.tag == 'p'
        text = str(p.text).strip(' :')
        assert len(list(p)) == 1
        a = list(p)[0]
        assert a.tag == 'a'
        link = a.get('href')
        assert len(list(a)) == 1
        code = list(a)[0]
        assert code.tag == 'code'
        assert len(list(code)) == 1
        span = list(code)[0]
        assert span.tag == 'span'
        linkText = span.text.strip()
        cases.append((text, link, linkText))
    assert cases == [
        ('A', 'Bare.html#grammar-token-A', 'A'),
        ('B', 'Bare.html#grammar-token-B', 'B'),
        ('P1:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P1:B', 'P1.html#grammar-token-P1-B', 'P1:B'),
        ('P2:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P2:B', 'P2.html#grammar-token-P2-B', 'P2:B'),
        ('Explicit title A, plain', 'Bare.html#grammar-token-A', 'MyTitle'),
        ('Explicit title A, colon', 'Bare.html#grammar-token-A', 'My:Title'),
        ('Explicit title P1:A, plain', 'P1.html#grammar-token-P1-A',
         'MyTitle'),
        ('Explicit title P1:A, colon', 'P1.html#grammar-token-P1-A',
         'My:Title'),
        ('Tilde A', 'Bare.html#grammar-token-A', 'A'),
        ('Tilde P1:A', 'P1.html#grammar-token-P1-A', 'A'),
        ('Tilde explicit title P1:A', 'P1.html#grammar-token-P1-A',
         '~MyTitle'),
        ('Tilde, explicit title P1:A', 'P1.html#grammar-token-P1-A',
         'MyTitle'),
        ('Dup', 'Dup2.html#grammar-token-Dup', 'Dup'),
        ('FirstLine', 'firstLineRule.html#grammar-token-FirstLine',
         'FirstLine'),
        ('SecondLine', 'firstLineRule.html#grammar-token-SecondLine',
         'SecondLine'),
    ]

    text = (app.outdir / 'LineContinuation.html').read_text()
    assert "A</strong> ::=  B C D    E F G" in text
def scrape_others(pct_name, url):
    types = ["doctor", "dentist", "pharmacy", "optician"]
    for facility_type, i in zip(types, range(2, 6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url + "&v=%d" % i))
        root = page.getroot()

        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name, s, facility_type)
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """

    print
    print
    print pct_name
    print "-" * len(pct_name)

    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"
    ):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"
    ):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text

    datastore.save(unique_keys=["PCT", "type", "name", "address"],
                   data=d,
                   latlng=d.get("latlng"))

    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
Example #29
    def body_html(self):
        body_html = self.get_part_content(self.mail_pyzmail.html_part)
        if not body_html and self.body_text:
            body_html = self.body_text.replace('\n', '<br />')

        parser = HTMLParser(tokenizer=HTMLSanitizer)
        parser.parse(body_html)

        return body_html
def scrape_others(pct_name,url):
    types = ["doctor","dentist","pharmacy","optician"]
    for facility_type,i in zip(types,range(2,6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url+"&v=%d"%i))
        root = page.getroot()

        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name,s,facility_type)
Example #31
    def body_html(self):
        body_html = self.get_part_content(self.mail_pyzmail.html_part)
        if not body_html and self.body_text:
            body_html = self.body_text.replace('\n', '<br />')

        parser = HTMLParser(tokenizer=HTMLSanitizer)
        parser.parse(body_html)

        return body_html
Example #32
def sanitize(content):
    parser = HTMLParser(tokenizer = sanitizer.HTMLSanitizer,
                             tree = treebuilders.getTreeBuilder("dom"))
    dom = parser.parseFragment(content)
    tree_walker = treewalkers.getTreeWalker("dom")
    tree_stream = tree_walker(dom)
    serial = serializer.HTMLSerializer(omit_optional_tags = False,
                                           quote_attr_values = True)
    output = serial.serialize(tree_stream)
    return u''.join(output)
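
A hedged usage sketch; exact output depends on the html5lib version, since the legacy HTMLSanitizer escapes disallowed markup rather than silently dropping it:

dirty = u'<p onclick="evil()">hi</p><script>alert(1)</script>'
print sanitize(dirty)
# the onclick handler is removed and the <script> element is rendered inert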
Example #33
    def _parse_and_linkify(self, content, dest_class):
        p = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                       namespaceHTMLElements=False)
        doc = p.parse(content)
        for node in doc.iter():
            self._link_finder(node)

        docstr = etree_tostring(doc, encoding=unicode)
        html_fragment = dest_class(docstr)
        return html_fragment
Example #34
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    block = soup.find(border="0", bgcolor="white") # isolate the table of data on the first result
    id_str = block.find('font').contents[0] #contents of first <font>
    # this should look like: 'ID#:11901'
    # parse out the actual id and cast as int
    id = int(id_str.partition(':')[2])
    print id
    return id
Example #35
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    block = soup.find(
        '',
        {'id': 'photoresult'})  # isolate the table of data on the first result
    block = block.findAll('', {'class': 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
Example #36
def parse_img_html_page(html):
    if not html or html == '':
        print "wait, something terrible has happened. abort mission!"
        return None
    metadict = init_dict()
    # soupify the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    if not soup:
        print "wait, we couldn't make a soup. i don't know WHY..."
        return None
    # the lores image url
    metadict['url_to_lores_img'] = soup.find("",{"class": "tophoto"}).find("img")["src"]
    # the description/caption
    metadict['desc'] = encode_all_nice(soup.find("", {"class": "caption"}).contents[0].strip())

    # html table with the rest of the data
    data_table = soup.find("", {"class": "photoinfo2"}).find("tbody")

    metadict['url_to_hires_img'] = find_by_tag_and_contents(data_table, "a", u"\u00BB Download original photo")['href']
    # TODO: for now, we just assume that the thumb image url follows a pattern
    # maybe we should really do a search for this image's id and scrape the thumb url off of the page. meh.
    metadict['url_to_thumb_img'] = string.replace(metadict['url_to_hires_img'], "original", "thumbnail")
    
    for data_label_cell in data_table.findAll("th"):
        try:
            label = data_label_cell.contents[0]
        except:
            continue
        if label == u"Location:":
            metadict['location'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Photographer:':
            metadict['photographer'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Photo Date:':
            metadict['photo_date'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'ID:':
            metadict['id'] = int(data_label_cell.findNextSibling("td").contents[0].strip())
        elif label == u'Original filename:':
            metadict['original_filename'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Size:':
            metadict['size'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Dimensions:':
            metadict['dimensions'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Categories:':
            categories_td = data_label_cell.findNextSibling("td")
            categories_list = filter(lambda x: isinstance(x,unicode), categories_td.contents)
            categories_list = map(lambda x: x.strip(), categories_list)
            categories_list = filter(lambda x: len(x) > 0, categories_list)
            metadict['categories'] = categories_list
        elif label == u'Disasters:':
            disasters_td = data_label_cell.findNextSibling("td")
            disaster_links = disasters_td.findAll("a")
            disaster_tuples = map(lambda link: (link.contents[0],link['href']), disaster_links)
            metadict['disasters'] = disaster_tuples
    return metadict
Example #37
def test_parser_reparse():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    pad = 10240 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 10240  # Sanity
    stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
    assert 'windows-1252' == stream.charEncoding[0].name
    p = HTMLParser(namespaceHTMLElements=False)
    doc = p.parse(data, useChardet=False)
    assert 'utf-8' == p.documentEncoding
    assert doc.find(".//title").text == "Caf\u00E9"
Example #38
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = HTMLParser(tokenizer=HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
Example #39
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = HTMLParser(tokenizer=HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
Example #40
def test_parser_reparse():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    pad = 10240 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 10240  # Sanity
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    assert 'windows-1252' == stream.charEncoding[0].name
    p = HTMLParser(namespaceHTMLElements=False)
    doc = p.parse(data, useChardet=False)
    assert 'utf-8' == p.documentEncoding
    assert doc.find(".//title").text == "Caf\u00E9"
Example #41
 def encodingTest(self, data=test['data'], 
                  encoding=test['encoding']):
     p = HTMLParser()
     t = p.parse(data, useChardet=False)
     
     errorMessage = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%
                     (data, repr(encoding.lower()), 
                      repr(p.tokenizer.stream.charEncoding)))
     self.assertEquals(encoding.lower(),
                       p.tokenizer.stream.charEncoding[0], 
                       errorMessage)
Example #42
def scraper2(request):
    #url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.display_courses'
    #parameters = 'crnl=no_value&distribradio=alldistribs&depts=no_value&periods=no_value&distribs=no_value&distribs_i=no_value&distribs_wc=no_value&pmode=public&term=&levl=&fys=n&wrt=n&pe=n&review=n&classyear=2008&searchtype=Subject+Area%28s%29&termradio=selectterms&terms=no_value&terms=201303&subjectradio=allsubjects&hoursradio=allhours&sortorder=dept'

    #headers = {"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        #"Accept-Encoding":"gzip, deflate",
        #"Accept-Language":"en-US,en;q=0.5",
        #"Connection":"keep-alive",
        #"Host":"oracle-www.dartmouth.edu",
        #"Referer":"http://oracle-www.dartmouth.edu/dart/groucho/timetable.subject_search",
        #"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:19.0) Gecko/20100101 Firefox/19.0"
    #}
	
    #req = urllib2.Request(url, parameters, headers)
    #response = urllib2.urlopen(req)

    #html = response.read()
    ## parse for the dept and course number
    #parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    #soup = parser.parse(html)

    # no need to keep scraping.
    html = open('spring2013_scraped.txt', 'r')
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    tbody = soup.find('th', text='Term').parent.parent.parent

    parsed = tbody.findAll('tr')

    # loop through each item 
    for i in range(1, len(parsed)):
        cells = parsed[i]('td')

        # sometimes the dept isn't hyperlinked
        try:
            subj = cells[2].contents[0].contents[0]
        except AttributeError:
            subj = cells[2].contents[0]

        coursenum = cells[3].contents[0]
        title = cells[6].contents[0].contents[0]

        # sometimes there is a listing like Tu 3:00PM-6:00PM.
        try: 
            period = cells[8].contents[0].contents[0]
        except AttributeError:
            period = cells[8].contents[0]

        print subj
        print coursenum
        print title
        print period

    return render_to_response("scraper.html", {'soup': title})
Example #43
def parse_img_html_page(html):
    if not html or html == '':
        print "wait, the page appears blank. abort mission!"
        return None
    metadict = init_dict()
    # soupify the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    if not soup:
        print "wait, we couldn't make a soup. i don't know WHY..."
        return None
    
    try:
        metadict['id'] = int(soup.find('input', {'type':'hidden', 'name': 'CISOPTR'})['value'])
    except:
        favorite_link_href = soup.find("a", {"title": u"Add to My Favorites"})['href']
        the_split = favorite_link_href.split("'")
        the_split.pop()
        metadict['id'] = int(the_split.pop())

    #TODO: this is kinda hackey but probably fine
    metadict['url_to_thumb_img'] = u'http://digitalmedia.fws.gov/cgi-bin/thumbnail.exe?CISOROOT=/natdiglib&CISOPTR=' + str(metadict['id'])

    hires_link = soup.find(text=lambda str: str.strip() == u'(Full Resolution Image Link)', recursive=True).parent.find('a')
    metadict['url_to_hires_img'] = hires_link['href']
    try:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("img", {"id" : "imagexy"})['src']
    except:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("input", {"type" : "image"})['src']

    data_table = soup.find("table", {"style": "border-top: 1px solid #cccccc"}).find("tbody")
    parsed_tuples = []
    for data_label_cell in data_table.findAll("td", {"width": "150"}):
        try:
            label = get_text_within(data_label_cell)
            print label
        except:
            continue
        data_cell = data_label_cell.findNextSibling("td")
        if label == 'Subject':
            data = data_cell.findAll(text=True)
        else:
            data = get_text_within(data_cell).strip()
        parsed_tuples.append((label, data))
    # now we have a list of tuples of the parsed metadata

    print parsed_tuples
    for label, data in parsed_tuples:
        field_key = data_schema.get_field_key_by_full_name(label)
        if not field_key:
            continue
        metadict[field_key] = data
        
    return metadict
Example #44
 def encodingTest(self, data=test['data'], 
                  encoding=test['encoding']):
     p = HTMLParser()
     t = p.parse(data, useChardet=False)
     
     errorMessage = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%
                     (data, repr(encoding.lower()), 
                      repr(p.tokenizer.stream.charEncoding)))
     self.assertEquals(encoding.lower(),
                       p.tokenizer.stream.charEncoding[0], 
                       errorMessage)
Example #45
def schoolscrape(categoryurl, name, url):

    print ""
    print name

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(specialscrape(url))

    # pre = "{http://www.w3.org/1999/xhtml}"
    pre = ""

    keyvaluepairs = {}

    def addkeyvaluepair(k, v):
        keyvaluepairs[k] = v
        print k + ": " + v

    data_rows = [
        t
        for t in page.findall(path(["body", "div", "div", "div", "div"], pre))
        if t.attrib.get("class", "") == "detailsRow"
    ]

    for row in data_rows:
        key = [
            t for t in row.findall(path(["span"], pre))
            if t.attrib.get("class", "") == "leftColumn"
        ][0].text.rstrip(": ")
        valuetag = [
            t for t in row.findall(path(["span"], pre))
            if t.attrib.get("class", "") == "rightColumn"
        ][0]
        if valuetag.text:
            if key == "Address":
                raw_address = [valuetag.text] + [
                    br.tail for br in valuetag.findall(path(["br"], pre))
                ]
                addkeyvaluepair("Address", " / ".join(raw_address[:-1]))
                addkeyvaluepair("Postcode", raw_address[-1])
            else:
                addkeyvaluepair(key, valuetag.text)
        else:
            links = valuetag.findall(path(["a"], pre))
            if len(links) == 1:
                addkeyvaluepair(key, links[0].attrib["href"])
            else:
                for link in links:
                    href = link.attrib["href"]
                    if href[:7] != "http://":
                        href = categoryurl + "details/" + href
                    addkeyvaluepair(link.text, href)

    datastore.save(unique_keys=["Name"], data=keyvaluepairs)
Example #46
  def parse_adminkun(self, html, keyword_tag):
    """
    Parse the HTML, collect all of the serialized articles, and return them
    as a list of dicts.
    html: the HTML content to parse
    keyword_tag: tag name selecting either new arrivals or back numbers
    """
    logging.info(u"### キーワード:%s.", keyword_tag)

    strip_strings = ["<br>", "<br />", "<b>", "</b>"]
    for s in strip_strings:
      html = html.replace(s, "")


    try:
      # first, try parsing with BeautifulSoup,
      soup = BeautifulSoup(html)
    except:
      # if that raises an error, fall back to parsing with html5lib
      print '### Exception: Could not parse by BeautifulSoup!!'
      parser = HTMLParser(tree = treebuilders.getTreeBuilder("beautifulsoup"))
      soup = parser.parse(html)

    data = []
    hash = {}
    for node in soup.findAll('div', {'class': keyword_tag}):
      for tag in node('table', {'width': '100%', 'cellpadding': '3', 'border': '0'}):
        body_url = tag.a['href']

        e = body_url.split('/')
        serial_number = e[4]

        index_title = u'%s' % (tag.a.next)

        if int(serial_number) <= 73:
          index_overview = tag.font.next
        else:
          index_overview = tag('font')[1].next

        index_image_url = tag.img['src']

        hash = {
          "serial_number": serial_number,
          "index_title": index_title,
          "index_overview": index_overview,
          "index_image_url": index_image_url,
          "body_url": body_url
        }
        data.append(hash)
        if keyword_tag == 'newbox':
          break
    # data.reverse()
    return data
def from_tiddler(uri, handle):
    """
    generates Tiddler from a Cook-style .tiddler file
    """
    content = handle.read().decode('utf-8', 'replace')
    content = _escape_brackets(content)

    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    dom = parser.parse(content)
    node = dom.getElementsByTagName('div')[0]

    return _get_tiddler_from_div(node)
Example #48
def from_tiddler(handle):
    """
    generates a tiddler from a Cook-style .tiddler file
    """
    content = handle.read().decode('utf-8', 'replace')
    content = _escape_brackets(content)

    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    dom = parser.parse(content)
    node = dom.getElementsByTagName('div')[0]

    return _get_tiddler_from_div(node)
Example #49
def test_maintain_duplicate_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {'name': 'html',
             'selfClosing': False,
             'selfClosingAcknowledged': False,
             'type': tokenTypes["StartTag"],
             'data': attrs + [('a', len(attrs))]}
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
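
This test relies on a few names not shown in the snippet; a guess at the imports it would need, based on html5lib's own test suite (an assumption, not verified against the original file):

from six import unichr                     # assumed py2/py3 shim for unichr
from html5lib import HTMLParser
from html5lib.constants import tokenTypes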
Example #50
File: o2.py Project: d-fens/smspie
	def message(self, phonenumber, message):
		"""
		Sends a message to the recipient.
		"""
		params = [
			('APIID', 'AUTH-WEBSSO'),
			('TargetApp', 'o2om_smscenter_new.osp?MsgContentID=-1&SID=_'),
			('utm_source', 'dashboard_webtext_link'),
			('utm_medium', 'link'),
			('utm_campaign', 'o2_dashboard'),
		]
		self.get('http://messaging.o2online.ie/ssomanager.osp', params)

		params = [
			('MsgContentID', '-1'),
			('SID', '_'),
			('SID', '8240509_utusnutl')
		]
		handle = self.get('http://messaging.o2online.ie/o2om_smscenter_new.osp', params)

		result = re.search('Number of free text messages remaining this month: <strong>(?P<remaining>\d+)</strong>', handle, re.IGNORECASE)
		if result:
			self.remaning = result.group('remaining')

		from html5lib import HTMLParser, treebuilders
		parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
		soup = parser.parse(handle)

		form = soup.find("form", {"name": "frmSMS"})
		if not form:
			self.logger.info("no form")
			return False
		inputs = form.findAll("input", {"type": "Hidden"})
		if not inputs:
			self.logger.info("no inputs")
			return False

		post = []
		for i in inputs:
			post.append((i.get('name'), i.get('value')))
		post.append(('SMSTo', phonenumber))
		post.append(('selcountry', '00355'))
		post.append(('SMSText', message))

		params = [
			('MsgContentID', '-1'),
			('SID', '_'),
			('SID', '8240509_utusnutl')
		]
		handle = self.post('http://messaging.o2online.ie/smscenter_send.osp', post, params)

		return False
Example #51
    def strict_validator(self):
        """
        Strict validation method.

        We just call html5lib parser with strict=True. Error messages are awful,
        and it complains about many small errors, so it can be annoying.
        """

        strict_parser = HTMLParser(strict=True)
        try:
            strict_parser.parse(self.data)
        except ParseError as ex:
            raise ValidationError(str(ex))
Example #52
def test_maintain_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    # generate loads to maximize the chance a hash-based mutation will occur
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {'name': 'html',
             'selfClosing': False,
             'selfClosingAcknowledged': False,
             'type': tokenTypes["StartTag"],
             'data': attrs}
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
Example #53
def test_maintain_duplicate_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {
        'name': 'html',
        'selfClosing': False,
        'selfClosingAcknowledged': False,
        'type': tokenTypes["StartTag"],
        'data': attrs + [('a', len(attrs))]
    }
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
Example #54
def extractMonthlyData(d):
    print "Date: " + d

    url = "http://www.tax.state.ak.us/programs/oil/production/ans.aspx?" + d

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(urlopen(url))

    for r in page.findall("body/form/div/div/div/div/table/tbody/tr"):
        l = list(c.text for c in r.findall("td"))
        d = processDate(l[0])
        if d:
            l[0] = d
            data = dict(zip(fields, l))
            datastore.save(unique_keys=["Date"], data=data)
Example #55
def test_literals(app, status, warning):
    app.build()

    with (app.outdir / 'literals.html').open() as html_file:
        etree = HTMLParser(namespaceHTMLElements=False).parse(html_file)

    for code_element in etree.iter('code'):
        code_text = ''.join(code_element.itertext())

        if code_text.startswith('code role'):
            assert "'quotes'" in code_text
        elif code_text.startswith('{'):
            assert code_text == "{'code': 'role', 'with': 'quotes'}"
        elif code_text.startswith('literal'):
            assert code_text == "literal with 'quotes'"
Example #56
def test_maintain_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    # generate loads to maximize the chance a hash-based mutation will occur
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {
        'name': 'html',
        'selfClosing': False,
        'selfClosingAcknowledged': False,
        'type': tokenTypes["StartTag"],
        'data': attrs
    }
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
Example #57
def main():
    url = "http://www.tax.state.ak.us/programs/oil/production.aspx"

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(urlopen(url))

    dates = []

    for o in page.findall("body/form/div/div/div/div/select/option"):
        d = o.attrib["value"]
        if processDate(d):
            dates.append(d)

    for d in dates:
        extractMonthlyData(d)
Example #58
    def __init__(self, html):
        """Create a parse tree from the given HTML."""
        def really_parse_fragment(parser, html):
            """Parse a possibly multi-rooted HTML fragment, wrapping it in a
            <div> to make it easy to query later.

            As far as I can tell, this is what parseFragment is supposed to do
            (but doesn't). See
            http://code.google.com/p/html5lib/issues/detail?id=161.

            """
            top_level_elements = parser.parseFragment(html)
            container = Element(self.CONTAINER_TAG)

            # Why lxml couldn't just have text nodes, I'll never understand.
            # Text nodes other than the first are automatically stuffed
            # into the tail attribute of the preceding element by html5lib.
            if top_level_elements and isinstance(top_level_elements[0],
                                                 basestring):
                container.text = top_level_elements.pop(0)

            container.extend(top_level_elements)
            return container

        p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
        self._root = really_parse_fragment(p, html)