def news():
    global url
    global ns
    global headers
    opener = urllib2.build_opener()
    opener.addheaders = headers
    pagetext = opener.open(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(pagetext)
    main = page.find("//%sdiv[@class='centre-wide-main-content-column']" % ns)
    for entry in main.findall("%sdiv" % ns):
        title = entry.find("%sdiv[@class='news-item news-title']" % ns).text.strip()
        number = int(filter(lambda c: c in string.digits, entry.attrib.get("onclick", "0")))
        url = "http://www.guernseyfc.com/news.details.php?id=%d&random=%s" % (number, ourhash(number))
        head_tag = entry.find("%sdiv[@class='news-item news-brief-descript']/%stable/%stbody/%str/%std/%sh1" % (ns, ns, ns, ns, ns, ns))
        if head_tag is None:
            head = ""
        else:
            head = head_tag.text
        scraperwiki.sqlite.save(unique_keys=["number"],
                                data={"title": title, "number": number, "url": url, "head": head})
def test_parser_encoding(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")
    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def wiki_string_to_tiddlers(content):
    """
    Turn a string that is a TiddlyWiki into individual tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)
    # minidom will not provide working getElementById without
    # first having a valid document, which means some very specific
    # doctype hooey. So we traverse.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break
    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
def login(self, username, password):
    """
    Log in to o2online.ie.

    Returns True if successful, False otherwise.
    """
    if self.resumable():
        self.logger.info("Resuming from login.")
        return True
    else:
        self.logger.info("Unable to resume, running connect from login.")
        self.connect()
    post = [
        ('IDButton', 'Go'),
        ('org', 'o2ext'),
        ('CONNECTFORMGET', 'TRUE'),
        ('IDToken1', username),
        ('IDToken2', password)
    ]
    handle = self.post('https://www.o2online.ie/amserver/UI/Login', post)
    from html5lib import HTMLParser, treebuilders
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(handle)
    if unicode(soup.html.head.title.string).strip() == u"LoginCheck":
        self.logger.info("login has correct HTML title.")
        return True
    return False
def extract_html_urls(self, html):
    """
    Take all ``<img src="..">`` from the HTML
    """
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom = p.parse(html)
    urls = []
    for img in dom.getElementsByTagName('img'):
        src = img.getAttribute('src')
        if src:
            urls.append(unquote_utf8(src))
        srcset = img.getAttribute('srcset')
        if srcset:
            urls += self.extract_srcset(srcset)
    for source in dom.getElementsByTagName('source'):
        srcset = source.getAttribute('srcset')
        if srcset:
            urls += self.extract_srcset(srcset)
    for source in dom.getElementsByTagName('a'):
        href = source.getAttribute('href')
        if href:
            urls.append(unquote_utf8(href))
    return urls
def shallow_scrape():
    urns = set([])
    br = mechanize.Browser()
    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml")
    moreWorkToDo = True
    c = 1
    while moreWorkToDo and (c < 3):
        print "Handling page %d..." % c
        # extract data from page
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(resultspage)
        for u in page.getroot().findall(path(["body", "div", "div", "div", "div", "table", "tbody", "tr", "td", "table", "tbody", "tr", "td", "a"], "")):
            #href = u.attrib.get("href","")
            href = u.get("href")
            print "href: %s" % href
            urn = re.search("urn=([0-9]{6})", href).group(1)
            urns.add(urn)
            print "%s, " % urn
        print
        # get new page
        try:
            resultspage = br.follow_link(text="Next")
            c += 1
        except mechanize.LinkNotFoundError:
            moreWorkToDo = False
    return urns
def scraper(request):
    post_data = {
        'classyear': '2008',  # why??
        'subj': 'COSC',
        'crsenum': '50'
    }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'
    # scrape the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()
    # parse for the dept and course number
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    #tbody = soup.find('th', text='Term').parent.parent.parent
    #soup = tbody.findAll('tr')[2]('td')
    return render_to_response("scraper.html", {'soup': soup})
def get_spaces_available(dept_abbr, course_num):
    # define the POST data
    post_data = {
        'classyear': '2008',  # don't know WHY!?!
        'subj': dept_abbr,
        'crsenum': course_num,
    }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'
    # get the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()
    # parse the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    tbody = soup.find('th', text='Term').parent.parent.parent
    cells = tbody.findAll('tr')[2]('td')
    enrolled = int(cells[-2].contents[0])
    capacity = int(cells[-3].contents[0])
    print "%i spaces left (capacity of %i with %i enrolled)" % (capacity - enrolled, capacity, enrolled)
def get_dom(self, buf):
    buf = buf.strip()
    if not buf:
        return None
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                   tokenizer=self.token_sanitizer())
    return p.parseFragment(buf)
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that
    the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (released on July 14, 2016)
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
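# A minimal usage sketch for clean_html() above, assuming Python 2 and an
# html5lib version matching the imports the function relies on (HTMLParser,
# treebuilders, treewalkers, HTMLSerializer, and optionally HTMLSanitizer):
#
#     print clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
#     # -> u'<p>Foo<b>bar</b><i>Ooops!</i></p>'  (the unbalanced <i> is closed)
#     print clean_html("<p>hi<script>x()</script></p>", sanitize=True)
#     # -> script markup is neutralized by the sanitizer instead of passing through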
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")
    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def __init__(self, api='etree'):
    # If no default implementation is defined for this api, set it to None
    # so getTreeBuilder() picks the corresponding implementation itself.
    implementation = self.defaults.get(api, None)
    HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder(api, implementation))
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")
    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(
        data, encoding, p.tokenizer.stream.charEncoding[0])
def wiki_string_to_tiddlers(content):
    """
    Turn a string that is a TiddlyWiki into individual tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)
    # minidom will not provide working getElementById without
    # first having a valid document, which means some very specific
    # doctype hooey. So we traverse.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break
    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
def summary_scrape(urn):
    print " - summary"
    url = "http://www.edubase.gov.uk/establishment/summary.xhtml?urn=" + urn
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    keyvaluepairs = table_extract(page)
    raw_address = [x.strip() for x in keyvaluepairs.pop("").split(" / ")]
    if postcode.match(raw_address[-1]):
        keyvaluepairs["Postcode"] = raw_address[-1]
        raw_address = raw_address[:-1]
    keyvaluepairs["Address"] = " / ".join(raw_address)
    for t in page.findall(path(["body", "div", "div", "div", "div", "table", "tbody", "tr", "td", "h1"], pre)):
        x = t.text.split(": ")
        keyvaluepairs[x[0]] = x[1]
    for t in page.findall(path(["body", "div", "div", "div", "div", "table", "tbody", "tr", "td", "div", "p", "b"], pre)):
        keyvaluepairs[t.text.strip().strip(":")] = (t.tail or "").strip()
    return keyvaluepairs
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                        tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)
    for section in page.findall("body/div/div/div/div/div/div/div/div/table[@class='fixture']"):
        matchtype = section.find("caption").text
        for match in section.findall("tbody/tr"):
            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"], d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode('ascii')
    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that
    the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
def test_debug_log():
    parser = HTMLParser(debug=True)
    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

    expected = [
        ('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
        ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
        ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
        ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
        ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
        ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
        ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
        ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
        ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
        ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
        ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
    ]

    if PY2:
        for i, log in enumerate(expected):
            log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
            expected[i] = tuple(log)

    assert parser.log == expected
def extract_html_urls(self, html):
    """
    Take all ``<img src="..">`` from the HTML
    """
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom = p.parse(html)
    urls = []
    for img in dom.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if src:
            urls.append(unquote_utf8(src))
        srcset = img.getAttribute("srcset")
        if srcset:
            urls += self.extract_srcset(srcset)
    for source in dom.getElementsByTagName("source"):
        srcset = source.getAttribute("srcset")
        if srcset:
            urls += self.extract_srcset(srcset)
    for source in dom.getElementsByTagName("a"):
        href = source.getAttribute("href")
        if href:
            urls.append(unquote_utf8(href))
    return urls
def test_parser_args(expected, data, kwargs):
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
    assert expected == stream.charEncoding[0].name
    p = HTMLParser()
    p.parse(data, useChardet=False, **kwargs)
    assert expected == p.documentEncoding
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    # isolate the table of data on the first result
    block = soup.find('', {'id': 'photoresult'})
    block = block.findAll('', {'class': 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
def parse(f):
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = p.parse(f)
    walker = treewalkers.getTreeWalker("dom")
    tokens = []
    bintokens = []
    waitfor = None
    for tok in walker(doc):
        if waitfor:
            # skip everything until the matching end tag
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue
        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])
        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)
        elif tok["type"] in ("Characters",):
            for tok1 in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": tok1})
        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            pass
        else:
            raise ValueError("unrecognizable token type: %r" % tok)
    # cumulative tag counts over the token stream
    cumbintokens = [bintokens[0]]
    for tok in bintokens[1:]:
        cumbintokens.append(cumbintokens[-1] + tok)
    length = len(cumbintokens)
    midx = None
    m = None
    for i in range(length):
        for j in range(i + 1, length):
            end_tag = cumbintokens[-1] - cumbintokens[j]
            start_tag = cumbintokens[i]
            text_between = (j - i) - (cumbintokens[j] - cumbintokens[i])
            nm = end_tag + start_tag + text_between
            if not midx or nm > m:
                midx = i, j
                m = nm
    i, j = midx
    return serialize_tokens(tokens[i:j + 1])
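# Note on parse() above: it marks tags as 1 and words as 0, then searches for
# the window (i, j) maximizing tags outside the window plus words inside it --
# a body-text-extraction style heuristic for finding the largest run of text
# with the fewest tags. A hypothetical usage sketch, assuming serialize_tokens()
# re-joins the selected tokens into markup:
#
#     with open("article.html") as f:
#         main_content = parse(f)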
def test_productionlist(app, status, warning):
    app.builder.build_all()

    warnings = warning.getvalue().split("\n")
    assert len(warnings) == 2
    assert warnings[-1] == ''
    assert "Dup2.rst:4: WARNING: duplicate token description of Dup, other instance in Dup1" in warnings[0]

    with (app.outdir / 'index.html').open('rb') as f:
        etree = HTMLParser(namespaceHTMLElements=False).parse(f)
    ul = list(etree.iter('ul'))[1]
    cases = []
    for li in list(ul):
        assert len(list(li)) == 1
        p = list(li)[0]
        assert p.tag == 'p'
        text = str(p.text).strip(' :')
        assert len(list(p)) == 1
        a = list(p)[0]
        assert a.tag == 'a'
        link = a.get('href')
        assert len(list(a)) == 1
        code = list(a)[0]
        assert code.tag == 'code'
        assert len(list(code)) == 1
        span = list(code)[0]
        assert span.tag == 'span'
        linkText = span.text.strip()
        cases.append((text, link, linkText))
    assert cases == [
        ('A', 'Bare.html#grammar-token-A', 'A'),
        ('B', 'Bare.html#grammar-token-B', 'B'),
        ('P1:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P1:B', 'P1.html#grammar-token-P1-B', 'P1:B'),
        ('P2:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P2:B', 'P2.html#grammar-token-P2-B', 'P2:B'),
        ('Explicit title A, plain', 'Bare.html#grammar-token-A', 'MyTitle'),
        ('Explicit title A, colon', 'Bare.html#grammar-token-A', 'My:Title'),
        ('Explicit title P1:A, plain', 'P1.html#grammar-token-P1-A', 'MyTitle'),
        ('Explicit title P1:A, colon', 'P1.html#grammar-token-P1-A', 'My:Title'),
        ('Tilde A', 'Bare.html#grammar-token-A', 'A'),
        ('Tilde P1:A', 'P1.html#grammar-token-P1-A', 'A'),
        ('Tilde explicit title P1:A', 'P1.html#grammar-token-P1-A', '~MyTitle'),
        ('Tilde, explicit title P1:A', 'P1.html#grammar-token-P1-A', 'MyTitle'),
        ('Dup', 'Dup2.html#grammar-token-Dup', 'Dup'),
        ('FirstLine', 'firstLineRule.html#grammar-token-FirstLine', 'FirstLine'),
        ('SecondLine', 'firstLineRule.html#grammar-token-SecondLine', 'SecondLine'),
    ]

    text = (app.outdir / 'LineContinuation.html').read_text()
    assert "A</strong> ::= B C D E F G" in text
def scrape_others(pct_name, url):
    types = ["doctor", "dentist", "pharmacy", "optician"]
    for facility_type, i in zip(types, range(2, 6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url + "&v=%d" % i))
        root = page.getroot()
        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name, s, facility_type)
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to
    scrape data associated with the services.
    """
    print
    print
    print pct_name
    print "-" * len(pct_name)
    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()
    d = {}
    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url
    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v
    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />", ", ")
    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text
    datastore.save(unique_keys=["PCT", "type", "name", "address"],
                   data=d, latlng=d.get("latlng"))
    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
def body_html(self):
    body_html = self.get_part_content(self.mail_pyzmail.html_part)
    if not body_html and self.body_text:
        body_html = self.body_text.replace('\n', '<br />')
    # run the sanitizing parser over the HTML; note the parse tree itself
    # is discarded and the original string is returned
    parser = HTMLParser(tokenizer=HTMLSanitizer)
    parser.parse(body_html)
    return body_html
def scrape_others(pct_name, url):
    types = ["doctor", "dentist", "pharmacy", "optician"]
    for facility_type, i in zip(types, range(2, 6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url + "&v=%d" % i))
        root = page.getroot()
        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name, s, facility_type)
def sanitize(content):
    parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                        tree=treebuilders.getTreeBuilder("dom"))
    dom = parser.parseFragment(content)
    tree_walker = treewalkers.getTreeWalker("dom")
    tree_stream = tree_walker(dom)
    serial = serializer.HTMLSerializer(omit_optional_tags=False,
                                       quote_attr_values=True)
    output = serial.serialize(tree_stream)
    return u''.join(output)
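# A minimal usage sketch for sanitize() above (assumes a pre-1.0 html5lib
# that still exposes sanitizer.HTMLSanitizer as a tokenizer):
#
#     clean = sanitize(u'<p onclick="evil()">hi<script>alert(1)</script></p>')
#     # `clean` keeps the paragraph text, while the onclick attribute and
#     # the script markup are stripped or escaped by the sanitizing tokenizer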
def _parse_and_linkify(self, content, dest_class):
    p = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                   namespaceHTMLElements=False)
    doc = p.parse(content)
    for node in doc.iter():
        self._link_finder(node)
    docstr = etree_tostring(doc, encoding=unicode)
    html_fragment = dest_class(docstr)
    return html_fragment
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    # isolate the table of data on the first result
    block = soup.find(border="0", bgcolor="white")
    # contents of first <font>; this should look like: 'ID#:11901'
    id_str = block.find('font').contents[0]
    # parse out the actual id and cast as int
    id = int(id_str.partition(':')[2])
    print id
    return id
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    # isolate the table of data on the first result
    block = soup.find('', {'id': 'photoresult'})
    block = block.findAll('', {'class': 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
def parse_img_html_page(html):
    if not html or html == '':
        print "wait, something terrible has happened. abort mission!"
        return None
    metadict = init_dict()
    # soupify the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    if not soup:
        print "wait, we couldn't make a soup. i don't know WHY..."
        return None
    # the lores image url
    metadict['url_to_lores_img'] = soup.find("", {"class": "tophoto"}).find("img")["src"]
    # the description/caption
    metadict['desc'] = encode_all_nice(soup.find("", {"class": "caption"}).contents[0].strip())
    # html table with the rest of the data
    data_table = soup.find("", {"class": "photoinfo2"}).find("tbody")
    metadict['url_to_hires_img'] = find_by_tag_and_contents(data_table, "a", u"\u00BB Download original photo")['href']
    # TODO: for now, we just assume that the thumb image url follows a pattern.
    # maybe we should really do a search for this image's id and scrape the
    # thumb url off of the page. meh.
    metadict['url_to_thumb_img'] = string.replace(metadict['url_to_hires_img'], "original", "thumbnail")
    for data_label_cell in data_table.findAll("th"):
        try:
            label = data_label_cell.contents[0]
        except:
            continue
        if label == u"Location:":
            metadict['location'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Photographer:':
            metadict['photographer'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Photo Date:':
            metadict['photo_date'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'ID:':
            metadict['id'] = int(data_label_cell.findNextSibling("td").contents[0].strip())
        elif label == u'Original filename:':
            metadict['original_filename'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Size:':
            metadict['size'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Dimensions:':
            metadict['dimensions'] = data_label_cell.findNextSibling("td").contents[0].strip()
        elif label == u'Categories:':
            categories_td = data_label_cell.findNextSibling("td")
            categories_list = filter(lambda x: isinstance(x, unicode), categories_td.contents)
            categories_list = map(lambda x: x.strip(), categories_list)
            categories_list = filter(lambda x: len(x) > 0, categories_list)
            metadict['categories'] = categories_list
        elif label == u'Disasters:':
            disasters_td = data_label_cell.findNextSibling("td")
            disaster_links = disasters_td.findAll("a")
            disaster_tuples = map(lambda link: (link.contents[0], link['href']), disaster_links)
            metadict['disasters'] = disaster_tuples
    return metadict
def test_parser_reparse():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    pad = 10240 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 10240  # Sanity
    stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
    assert 'windows-1252' == stream.charEncoding[0].name
    p = HTMLParser(namespaceHTMLElements=False)
    doc = p.parse(data, useChardet=False)
    assert 'utf-8' == p.documentEncoding
    assert doc.find(".//title").text == "Caf\u00E9"
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = HTMLParser(tokenizer=HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
def test_parser_reparse():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    pad = 10240 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 10240  # Sanity
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    assert 'windows-1252' == stream.charEncoding[0].name
    p = HTMLParser(namespaceHTMLElements=False)
    doc = p.parse(data, useChardet=False)
    assert 'utf-8' == p.documentEncoding
    assert doc.find(".//title").text == "Caf\u00E9"
def encodingTest(self, data=test['data'], encoding=test['encoding']):
    p = HTMLParser()
    t = p.parse(data, useChardet=False)
    errorMessage = ("Input:\n%s\nExpected:\n%s\nReceived:\n%s\n" %
                    (data, repr(encoding.lower()), repr(p.tokenizer.stream.charEncoding)))
    self.assertEquals(encoding.lower(), p.tokenizer.stream.charEncoding[0], errorMessage)
def scraper2(request):
    #url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.display_courses'
    #parameters = 'crnl=no_value&distribradio=alldistribs&depts=no_value&periods=no_value&distribs=no_value&distribs_i=no_value&distribs_wc=no_value&pmode=public&term=&levl=&fys=n&wrt=n&pe=n&review=n&classyear=2008&searchtype=Subject+Area%28s%29&termradio=selectterms&terms=no_value&terms=201303&subjectradio=allsubjects&hoursradio=allhours&sortorder=dept'
    #headers = {"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    #"Accept-Encoding":"gzip, deflate",
    #"Accept-Language":"en-US,en;q=0.5",
    #"Connection":"keep-alive",
    #"Host":"oracle-www.dartmouth.edu",
    #"Referer":"http://oracle-www.dartmouth.edu/dart/groucho/timetable.subject_search",
    #"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:19.0) Gecko/20100101 Firefox/19.0"
    #}
    #req = urllib2.Request(url, parameters, headers)
    #response = urllib2.urlopen(req)
    #html = response.read()
    ## parse for the dept and course number
    #parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    #soup = parser.parse(html)

    # no need to keep scraping.
    html = open('spring2013_scraped.txt', 'r')
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    tbody = soup.find('th', text='Term').parent.parent.parent
    parsed = tbody.findAll('tr')
    # loop through each item
    for i in range(1, len(parsed)):
        cells = parsed[i]('td')
        # sometimes the dept isn't hyperlinked
        try:
            subj = cells[2].contents[0].contents[0]
        except AttributeError:
            subj = cells[2].contents[0]
        coursenum = cells[3].contents[0]
        title = cells[6].contents[0].contents[0]
        # sometimes there is a listing like Tu 3:00PM-6:00PM.
        try:
            period = cells[8].contents[0].contents[0]
        except AttributeError:
            period = cells[8].contents[0]
        print subj
        print coursenum
        print title
        print period
    return render_to_response("scraper.html", {'soup': title})
def parse_img_html_page(html):
    if not html or html == '':
        print "wait, the page appears blank. abort mission!"
        return None
    metadict = init_dict()
    # soupify the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    if not soup:
        print "wait, we couldn't make a soup. i don't know WHY..."
        return None
    try:
        metadict['id'] = int(soup.find('input', {'type': 'hidden', 'name': 'CISOPTR'})['value'])
    except:
        favorite_link_href = soup.find("a", {"title": u"Add to My Favorites"})['href']
        the_split = favorite_link_href.split("'")
        the_split.pop()
        metadict['id'] = int(the_split.pop())
    # TODO: this is kinda hacky but probably fine
    metadict['url_to_thumb_img'] = u'http://digitalmedia.fws.gov/cgi-bin/thumbnail.exe?CISOROOT=/natdiglib&CISOPTR=' + str(metadict['id'])
    hires_link = soup.find(text=lambda str: str.strip() == u'(Full Resolution Image Link)', recursive=True).parent.find('a')
    metadict['url_to_hires_img'] = hires_link['href']
    try:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("img", {"id": "imagexy"})['src']
    except:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("input", {"type": "image"})['src']
    data_table = soup.find("table", {"style": "border-top: 1px solid #cccccc"}).find("tbody")
    parsed_tuples = []
    for data_label_cell in data_table.findAll("td", {"width": "150"}):
        try:
            label = get_text_within(data_label_cell)
            print label
        except:
            continue
        data_cell = data_label_cell.findNextSibling("td")
        if label == 'Subject':
            data = data_cell.findAll(text=True)
        else:
            data = get_text_within(data_cell).strip()
        parsed_tuples.append((label, data))
    # now we have a list of tuples of the parsed metadata
    print parsed_tuples
    for label, data in parsed_tuples:
        field_key = data_schema.get_field_key_by_full_name(label)
        if not field_key:
            continue
        metadict[field_key] = data
    return metadict
def schoolscrape(categoryurl, name, url):
    print ""
    print name
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(specialscrape(url))
    # pre = "{http://www.w3.org/1999/xhtml}"
    pre = ""
    keyvaluepairs = {}

    def addkeyvaluepair(k, v):
        keyvaluepairs[k] = v
        print k + ": " + v

    data_rows = [t for t in page.findall(path(["body", "div", "div", "div", "div"], pre))
                 if t.attrib.get("class", "") == "detailsRow"]
    for row in data_rows:
        key = [t for t in row.findall(path(["span"], pre))
               if t.attrib.get("class", "") == "leftColumn"][0].text.rstrip(": ")
        valuetag = [t for t in row.findall(path(["span"], pre))
                    if t.attrib.get("class", "") == "rightColumn"][0]
        if valuetag.text:
            if key == "Address":
                raw_address = [valuetag.text] + [br.tail for br in valuetag.findall(path(["br"], pre))]
                addkeyvaluepair("Address", " / ".join(raw_address[:-1]))
                addkeyvaluepair("Postcode", raw_address[-1])
            else:
                addkeyvaluepair(key, valuetag.text)
        else:
            links = valuetag.findall(path(["a"], pre))
            if len(links) == 1:
                addkeyvaluepair(key, links[0].attrib["href"])
            else:
                for link in links:
                    href = link.attrib["href"]
                    if href[:7] != "http://":
                        href = categoryurl + "details/" + href
                    addkeyvaluepair(link.text, href)
    datastore.save(unique_keys=["Name"], data=keyvaluepairs)
def parse_adminkun(self, html, keyword_tag):
    """
    Parse the HTML, collect every serialized article, and return them
    as a list of dicts.

    html: the HTML content to parse
    keyword_tag: tag name selecting either new arrivals or back numbers
    """
    logging.info(u"### keyword: %s.", keyword_tag)
    strip_strings = ["<br>", "<br />", "<b>", "</b>"]
    for s in strip_strings:
        html = html.replace(s, "")
    try:
        # try parsing with BeautifulSoup first,
        soup = BeautifulSoup(html)
    except:
        # and fall back to html5lib if that raises an error
        print '### Exception: Could not parse by BeautifulSoup!!'
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    data = []
    hash = {}
    for node in soup.findAll('div', {'class': keyword_tag}):
        for tag in node('table', {'width': '100%', 'cellpadding': '3', 'border': '0'}):
            body_url = tag.a['href']
            e = body_url.split('/')
            serial_number = e[4]
            index_title = u'%s' % (tag.a.next)
            if int(serial_number) <= 73:
                index_overview = tag.font.next
            else:
                index_overview = tag('font')[1].next
            index_image_url = tag.img['src']
            hash = {
                "serial_number": serial_number,
                "index_title": index_title,
                "index_overview": index_overview,
                "index_image_url": index_image_url,
                "body_url": body_url
            }
            data.append(hash)
        if keyword_tag == 'newbox':
            break
    # data.reverse()
    return data
def from_tiddler(uri, handle):
    """
    generates a Tiddler from a Cook-style .tiddler file
    """
    content = handle.read().decode('utf-8', 'replace')
    content = _escape_brackets(content)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    dom = parser.parse(content)
    node = dom.getElementsByTagName('div')[0]
    return _get_tiddler_from_div(node)
def from_tiddler(handle):
    """
    generates a tiddler from a Cook-style .tiddler file
    """
    content = handle.read().decode('utf-8', 'replace')
    content = _escape_brackets(content)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    dom = parser.parse(content)
    node = dom.getElementsByTagName('div')[0]
    return _get_tiddler_from_div(node)
def test_maintain_duplicate_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {'name': 'html',
             'selfClosing': False,
             'selfClosingAcknowledged': False,
             'type': tokenTypes["StartTag"],
             'data': attrs + [('a', len(attrs))]}
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
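# The test above relies on normalizeToken() turning the attribute list into an
# ordered mapping: when a duplicate name appears (the extra 'a' at the end),
# the first occurrence wins and the order of first occurrences is preserved,
# which is why the expected key order is simply the original attrs.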
def message(self, phonenumber, message):
    """
    Sends a message to the recipient.
    """
    params = [
        ('APIID', 'AUTH-WEBSSO'),
        ('TargetApp', 'o2om_smscenter_new.osp?MsgContentID=-1&SID=_'),
        ('utm_source', 'dashboard_webtext_link'),
        ('utm_medium', 'link'),
        ('utm_campaign', 'o2_dashboard'),
    ]
    self.get('http://messaging.o2online.ie/ssomanager.osp', params)
    params = [
        ('MsgContentID', '-1'),
        ('SID', '_'),
        ('SID', '8240509_utusnutl')
    ]
    handle = self.get('http://messaging.o2online.ie/o2om_smscenter_new.osp', params)
    result = re.search(
        'Number of free text messages remaining this month: <strong>(?P<remaining>\d+)</strong>',
        handle, re.IGNORECASE)
    if result:
        self.remaining = result.group('remaining')
    from html5lib import HTMLParser, treebuilders
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(handle)
    form = soup.find("form", {"name": "frmSMS"})
    if not form:
        self.logger.info("no form")
        return False
    inputs = form.findAll("input", {"type": "Hidden"})
    if not inputs:
        self.logger.info("no inputs")
        return False
    post = []
    for i in inputs:
        post.append((i.get('name'), i.get('value')))
    post.append(('SMSTo', phonenumber))
    post.append(('selcountry', '00355'))
    post.append(('SMSText', message))
    params = [
        ('MsgContentID', '-1'),
        ('SID', '_'),
        ('SID', '8240509_utusnutl')
    ]
    handle = self.post('http://messaging.o2online.ie/smscenter_send.osp', post, params)
    return False
def strict_validator(self):
    """
    Strict validation method.

    We just call the html5lib parser with strict=True. Error messages are
    awful, and it complains about many small errors, so it can be annoying.
    """
    strict_parser = HTMLParser(strict=True)
    try:
        strict_parser.parse(self.data)
    except ParseError as ex:
        raise ValidationError(str(ex))
def test_maintain_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    # generate loads to maximize the chance a hash-based mutation will occur
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {'name': 'html',
             'selfClosing': False,
             'selfClosingAcknowledged': False,
             'type': tokenTypes["StartTag"],
             'data': attrs}
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
def test_maintain_duplicate_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {
        'name': 'html',
        'selfClosing': False,
        'selfClosingAcknowledged': False,
        'type': tokenTypes["StartTag"],
        'data': attrs + [('a', len(attrs))]
    }
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
def extractMonthlyData(d):
    print "Date: " + d
    url = "http://www.tax.state.ak.us/programs/oil/production/ans.aspx?" + d
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(urlopen(url))
    for r in page.findall("body/form/div/div/div/div/table/tbody/tr"):
        l = list(c.text for c in r.findall("td"))
        d = processDate(l[0])
        if d:
            l[0] = d
            data = dict(zip(fields, l))
            datastore.save(unique_keys=["Date"], data=data)
def test_literals(app, status, warning):
    app.build()

    with (app.outdir / 'literals.html').open() as html_file:
        etree = HTMLParser(namespaceHTMLElements=False).parse(html_file)

    for code_element in etree.iter('code'):
        code_text = ''.join(code_element.itertext())
        if code_text.startswith('code role'):
            assert "'quotes'" in code_text
        elif code_text.startswith('{'):
            assert code_text == "{'code': 'role', 'with': 'quotes'}"
        elif code_text.startswith('literal'):
            assert code_text == "literal with 'quotes'"
def test_maintain_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    # generate loads to maximize the chance a hash-based mutation will occur
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {
        'name': 'html',
        'selfClosing': False,
        'selfClosingAcknowledged': False,
        'type': tokenTypes["StartTag"],
        'data': attrs
    }
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
def main():
    url = "http://www.tax.state.ak.us/programs/oil/production.aspx"
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(urlopen(url))
    dates = []
    for o in page.findall("body/form/div/div/div/div/select/option"):
        d = o.attrib["value"]
        if processDate(d):
            dates.append(d)
    for d in dates:
        extractMonthlyData(d)
def __init__(self, html):
    """Create a parse tree from the given HTML."""
    def really_parse_fragment(parser, html):
        """Parse a possibly multi-rooted HTML fragment, wrapping it in a
        <div> to make it easy to query later.

        As far as I can tell, this is what parseFragment is supposed to do
        (but doesn't). See
        http://code.google.com/p/html5lib/issues/detail?id=161.
        """
        top_level_elements = parser.parseFragment(html)
        container = Element(self.CONTAINER_TAG)
        # Why lxml couldn't just have text nodes, I'll never understand.
        # Text nodes that come other than first are automatically stuffed
        # into the tail attrs of the preceding elements by html5lib.
        if top_level_elements and isinstance(top_level_elements[0], basestring):
            container.text = top_level_elements.pop(0)
        container.extend(top_level_elements)
        return container

    p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
    self._root = really_parse_fragment(p, html)
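# A minimal usage sketch for the wrapper above, assuming a hypothetical
# enclosing class (here called MarkupWrapper) that defines TREEBUILDER and
# CONTAINER_TAG, mirroring the attributes referenced in __init__:
#
#     tree = MarkupWrapper("leading text<p>para</p>tail text")
#     # tree._root is a single container element wrapping all fragment
#     # roots, with the leading text preserved in .text rather than lost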