def __init__(self, guess=None, filename=None, url=None, file_obj=None, string=None, tree=None, encoding=None, base_url=None, url_fetcher=default_url_fetcher, media_type='print'): result = _select_source( guess, filename, url, file_obj, string, tree, base_url, url_fetcher) with result as (source_type, source, base_url, protocol_encoding): if source_type == 'tree': result = source else: if isinstance(source, unicode): result = html5lib.parse( source, treebuilder='lxml', namespaceHTMLElements=False) else: result = html5lib.parse( source, treebuilder='lxml', override_encoding=encoding, transport_encoding=protocol_encoding, namespaceHTMLElements=False) assert result base_url = find_base_url(result, base_url) if hasattr(result, 'getroot'): result.docinfo.URL = base_url result = result.getroot() else: result.getroottree().docinfo.URL = base_url self.root_element = result self.base_url = base_url self.url_fetcher = url_fetcher self.media_type = media_type
def iter_datatable(session, url, **kwargs):
    url += '&numResults=1000&startIndex=0'
    l = blackboard.slowlog()
    response = session.get(url)
    if kwargs.pop('edit_mode', False):
        response = session.ensure_edit_mode(response)
    l("Fetching datatable page 1 took %.4f s")
    history = list(response.history) + [response]
    document = html5lib.parse(response.content, encoding=response.encoding)
    keys, rows = parse_datatable(response, document, **kwargs)
    yield keys
    yield from rows
    next_id = 'listContainer_nextpage_top'
    next_o = document.find('.//h:a[@id="%s"]' % next_id, NS)
    page_number = 1
    while next_o:
        page_number += 1
        url = urljoin(response.url, next_o.get('href'))
        l = blackboard.slowlog()
        response = session.get(url)
        l("Fetching datatable page %d took %.4f s", page_number)
        history += list(response.history) + [response]
        document = html5lib.parse(response.content, encoding=response.encoding)
        keys_, rows = parse_datatable(response, document, **kwargs)
        if keys != keys_:
            raise ValueError(
                "Page %d keys (%r) do not match page 1 keys (%r)" %
                (page_number, keys_, keys))
        next_o = document.find('.//h:a[@id="%s"]' % next_id, NS)
        yield from rows
    response.history = history[:-1]
    yield response
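# iter_datatable yields the header row first, then data rows from every page,
# and finally the last response object. A minimal consumption sketch, assuming
# a requests-compatible session and a datatable URL are already available:
def dump_datatable(session, url):
    gen = iter_datatable(session, url)
    keys = next(gen)          # first yield: column keys
    items = list(gen)         # remaining yields: rows, then the response
    response = items.pop()    # final yield: the last requests.Response
    print(keys)
    print(len(items), 'rows, final URL:', response.url)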
def test_index_html(request_factory):
    with request_factory(matched_route='home', map=None) as req:
        for rsc in _RESOURCES:
            if not hasattr(rsc.model, 'first') or not rsc.with_index:
                continue
            dt = req.get_datatable(rsc.name + 's', rsc.model)
            res = render(
                '%s/index_html.mako' % rsc.name, {'ctx': dt}, request=req)
            html5lib.parse(res)
def test_detail_html(self):
    self.set_request_properties(matched_route=Route(), map=None)
    for rsc in _RESOURCES:
        if not hasattr(rsc.model, 'first'):
            continue
        res = render(
            '%s/detail_html.mako' % rsc.name,
            dict(ctx=rsc.model.first()),
            request=self.env['request'])
        html5lib.parse(res)
def test_index_html(self):
    self.set_request_properties(matched_route=Route(), map=None)
    for rsc in _RESOURCES:
        if not hasattr(rsc.model, 'first') or not rsc.with_index:
            continue
        dt = self.env['request'].get_datatable(rsc.name + 's', rsc.model)
        res = render(
            '%s/index_html.mako' % rsc.name,
            dict(ctx=dt),
            request=self.env['request'])
        html5lib.parse(res)
def test_detail_html(request_factory):
    with request_factory(matched_route='home', map=None) as req:
        for rsc in _RESOURCES:
            if not hasattr(rsc.model, 'first'):
                continue
            res = render(
                '%s/detail_html.mako' % rsc.name,
                {'ctx': rsc.model.first()},
                request=req)
            html5lib.parse(res)
            if rsc.name == 'dataset':
                assert 'http://example.org/privacy' in res
                assert 'Privacy Policy' in res
def huge_bench_html5lib_etree(files):
    etree_doc = html5lib.parse(files['template.html'])
    assert etree_doc.tag == '{http://www.w3.org/1999/xhtml}html'
    print(' template done;')
    etree_doc2 = html5lib.parse(files['spec.html'])
    assert etree_doc2.tag == '{http://www.w3.org/1999/xhtml}html'
    print(' spec done;')
    etree_doc3 = html5lib.parse(files['py33_py34.html'])
    assert etree_doc3.tag == '{http://www.w3.org/1999/xhtml}html'
    print(' py33_py34 done.')
def test_html5lib(count, spec_data):
    # No warm-up runs for this benchmark; in real life, the parser doesn't get
    # to warm up (this isn't a daemon process).
    times = []
    for _ in range(count):
        spec_data.seek(0)
        t0 = time.time()
        html5lib.parse(spec_data)
        t1 = time.time()
        times.append(t1 - t0)
    return times
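# The benchmark above returns raw per-run timings; reporting the minimum (the
# least noisy figure) alongside the mean is a common way to summarize them.
# A small sketch, assuming spec_data is any seekable file-like object:
def report_html5lib_times(count, spec_data):
    times = test_html5lib(count, spec_data)
    print('runs: %d  min: %.3f s  mean: %.3f s'
          % (len(times), min(times), sum(times) / len(times)))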
def huge_bench_html5lib_dom(files):
    print('html5lib dom:')
    dom_doc = html5lib.parse(files['template.html'], treebuilder="dom")
    assert len(dom_doc.toxml()) > 1024
    print(' template done;')
    dom_doc2 = html5lib.parse(files['spec.html'], treebuilder="dom")
    assert len(dom_doc2.toxml()) > 1024
    print(' spec done;')
    dom_doc3 = html5lib.parse(files['py33_py34.html'], treebuilder="dom")
    assert len(dom_doc3.toxml()) > 1024
    print(' py33_py34 done.')
def huge_bench_html5lib_lxml(files):
    print('html5lib lxml:')
    lxml_doc = html5lib.parse(files['template.html'], treebuilder="lxml")
    assert lxml_doc.getroot().tag == '{http://www.w3.org/1999/xhtml}html'
    print(' template done;')
    lxml_doc2 = html5lib.parse(files['spec.html'], treebuilder="lxml")
    assert lxml_doc2.getroot().tag == '{http://www.w3.org/1999/xhtml}html'
    print(' spec done;')
    lxml_doc3 = html5lib.parse(files['py33_py34.html'], treebuilder="lxml")
    assert lxml_doc3.getroot().tag == '{http://www.w3.org/1999/xhtml}html'
    print(' py33_py34 done.')
def fetch_data(download_dir):
    # Fetch the index page to get a CSRF token.
    r = requests.get('https://www.runtastic.com/')
    if r.status_code != 200:
        raise Exception('Failed to fetch index page')
    cookies = dict(r.cookies)
    doc = html5lib.parse(r.text, treebuilder='dom')
    csrf = get_csrf_token(doc)

    # Now log in.
    user, pw = read_user_pass()
    login = dict(csrf)
    login['user[email]'] = user
    login['user[password]'] = pw
    r2 = requests.post('https://www.runtastic.com/en/d/users/sign_in.json',
                       data=login, cookies=cookies)
    if r2.status_code != 200:
        raise Exception('Login request failed')
    cookies.update(r2.cookies)
    j = r2.json()
    if not j['success']:
        raise Exception('Login failed')
    doc = html5lib.parse(j['update'], treebuilder='dom')

    # Find the sport-sessions page and fetch it to get a User ID
    # and a list of session IDs.
    links = [l.getAttribute('href') for l in doc.getElementsByTagName('a')
             if l.getAttribute('href').endswith('/sport-sessions')]
    sessions_url = urlparse.urljoin(r2.url, links[0])
    r3 = requests.get(sessions_url, cookies=cookies)
    if r3.status_code != 200:
        raise Exception('Failed to fetch sport-sessions page')
    cookies.update(r3.cookies)
    doc = html5lib.parse(r3.text, treebuilder='dom')
    uid = get_user_id(doc)
    data = get_data(doc)

    # Now hit the API to get data about each session.
    request_data = dict(csrf)
    request_data['user_id'] = uid
    request_data['items'] = ','.join(str(d[0]) for d in data)
    r4 = requests.post('https://www.runtastic.com/api/run_sessions/json',
                       cookies=cookies, data=request_data)
    if r4.status_code != 200:
        raise Exception('Failed to fetch session list')
    cookies.update(r4.cookies)
    sessions = r4.json()

    known_sessions = read_known_sessions()
    for s in sessions:
        if s['id'] in known_sessions:
            continue
        if check_download_session(urlparse.urljoin(r4.url, s['page_url']) + '.tcx',
                                  download_dir, cookies):
            known_sessions.add(s['id'])
    write_known_sessions(known_sessions)
def parse_html(file_obj_or_str, url=None): '''Discovers various metadata URLs embedded in a given HTML document, such as feeds and RDF. :param file_obj_or_str: The HTML document to be parsed. :type file_obj_or_str: a file-like object or :class:`str` :param url: The URL that the HTML document was retrieved from. :type url: :class:`str` or :const:`None` :returns: A dictionary, where the key is the URL's MIME type and the value is a dictionary of URL-title pairs. :rtype: :class:`dict` ''' urls = {} # load the modules only when the function is first called. if not _MODULE_CACHE: for name, module in MODULES.iteritems(): mod = import_module(module, package=__name__) try: _MODULE_CACHE[name] = mod.Discoverer except AttributeError: raise AttributeError('''\ Could not find a Discoverer object in the %s module.''' % name) doc = html5lib.parse(file_obj_or_str, treebuilder='lxml') print type(doc) for name, discoverer in _MODULE_CACHE.iteritems(): urls[name] = discoverer.parse(doc, url=url) return urls
def cleanup_html(html):
    """Cleans up malformed and wrongly-encoded HTML.

    Returns well-formed, UTF-8 encoded HTML."""
    h = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False)
    stream = StringIO()
    h.write(stream, encoding='utf-8')
    return stream.getvalue()
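# On Python 3, lxml's write() emits bytes when an encoding is given, so a
# BytesIO-based variant of the helper above would look like this (a sketch,
# not part of the original code):
from io import BytesIO

def cleanup_html_py3(html):
    h = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False)
    stream = BytesIO()
    h.write(stream, encoding='utf-8')
    return stream.getvalue()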
def test_to_sax(): handler = support.TracingSaxHandler() tree = html5lib.parse("""<html xml:lang="en"> <title>Directory Listing</title> <a href="/"><b/></p> """, treebuilder="etree") walker = getTreeWalker("etree") sax.to_sax(walker(tree), handler) expected = [ 'startDocument', ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html', {(None, 'xml:lang'): 'en'}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}), ('characters', 'Directory Listing'), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'), ('characters', '\n '), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'), ('characters', '\n '), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'), 'endDocument', ] assert expected == handler.visited
def _parse_content(self, content): document = html5lib.parse(content, treebuilder="lxml", namespaceHTMLElements=False) for tbl in document.xpath('.//table'): ths = tbl.xpath('.//tr//th') if len(ths) == 0 or ths[0].text != 'Auction Date': continue for row in tbl.xpath('.//tr')[1:]: row_data = row.xpath('td') if len(row_data) == 0 or row_data[0].text.strip() == '': continue str_good_date = sub('([0-9]*)[a-z]*( [A-z]* [0-9]*)',r'\1\2',row_data[0].text.strip()) dtt = datetime.strptime(str_good_date, "%d %B %Y").date() auction_info = {'date': dtt, 'average_price': _convert_type(row_data[1].text.strip()[1:], 'float'), 'lowest_price': _convert_type(row_data[2].text.strip()[1:], 'float'), 'total_volume': _convert_type(row_data[3].text or '0', 'int'), 'co_fired_volume': _convert_type(row_data[4].text or '0', 'int'), 'period': "{}{:02d}".format(dtt.year, dtt.month)} self.auctions.append(auction_info) for info in self.auctions: if info['period'] in self.periods: previous = self.periods[info['period']] if not isinstance(previous, list): self.periods[info['period']] = [info['average_price'], previous] else: self.periods[info['period']].append(info['average_price']) else: self.periods[info['period']] = info['average_price'] for key in self.periods.keys(): if isinstance(self.periods[key], list): self.periods[key] = sum(self.periods[key]) / len(self.periods[key]) return True
def assertADWithRawHTML(self, params, expected, template=None):
    if not template:
        template = HTML5_TEMPLATE
    html = template % params
    doc = html5lib.parse(html, treebuilder='lxml')
    feeds = self.parse_with_base(doc)
    self.assertEqual(feeds, expected)
def metadata(self) -> str:
    """Returns the metadata of this video.

    :return: the video's metadata
    :rtype: :class:`str`

    """
    if self._metadata:
        return self._metadata
    response = urlopen(self.to_url)
    if response.status != 200:
        raise Exception('status_code: {}, url: {}'.format(
            response.status, self.to_url))
    html = html5lib.parse(response.readall().decode('utf-8'),
                          namespaceHTMLElements=False)
    xpath_builder = "./head//meta[@{attr}='{value}']".format
    query = [
        ('property', 'og:title'),
        ('property', 'og:image'),
        ('name', 'keywords'),
    ]
    for attr, value in query:
        elem = html.findall(xpath_builder(attr=attr, value=value))
        if elem:
            # Store on the backing dict; assigning through the property
            # would re-enter this getter.
            self._metadata[value] = elem[0].get('content')
        self._metadata.setdefault(value, None)
    return self._metadata
def parse_comments(self, root, raw): ans = '' ns = tuple(self.selector('#bookDescription_feature_div noscript')) if ns: ns = ns[0] if len(ns) == 0 and ns.text: import html5lib # html5lib parsed noscript as CDATA ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0] else: ns.tag = 'div' ans = self._render_comments(ns) else: desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') if desc: ans = self._render_comments(desc[0]) desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') if desc: ans += self._render_comments(desc[0]) else: # Idiot chickens from amazon strike again. This data is now stored # in a JS variable inside a script tag URL encoded. m = re.search(b'var\s+iframeContent\s*=\s*"([^"]+)"', raw) if m is not None: try: text = unquote(m.group(1)).decode('utf-8') nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False) desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]') if desc: ans += self._render_comments(desc[0]) except Exception as e: self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e)) return ans
def __init__(self, filename, test_type, parent): self.url = parent.session.config.server.url(filename) self.type = test_type self.variants = [] # Some tests are reliant on the WPT servers substitution functionality, # so tests must be retrieved from the server rather than read from the # file system directly. handle = urllib.request.urlopen(self.url, context=parent.session.config.ssl_context) try: markup = handle.read() finally: handle.close() if test_type not in TEST_TYPES: raise ValueError('Unrecognized test type: "%s"' % test_type) parsed = html5lib.parse(markup, namespaceHTMLElements=False) name = None includes_variants_script = False self.expected = None for element in parsed.getiterator(): if not name and element.tag == 'title': name = element.text continue if element.tag == 'meta' and element.attrib.get('name') == 'variant': self.variants.append(element.attrib.get('content')) continue if element.tag == 'script': if element.attrib.get('id') == 'expected': self.expected = json.loads(text_type(element.text)) src = element.attrib.get('src', '') if 'variants.js' in src: includes_variants_script = True if not resolve_uri(filename, src): raise ValueError('Could not resolve path "%s" from %s' % (src, filename)) if not name: raise ValueError('No name found in file: %s' % filename) elif self.type == 'functional': if not self.expected: raise ValueError('Functional tests must specify expected report data') if not includes_variants_script: raise ValueError('No variants script found in file: %s' % filename) if len(self.variants) == 0: raise ValueError('No test variants specified in file %s' % filename) elif self.type == 'unit' and self.expected: raise ValueError('Unit tests must not specify expected report data') # Ensure that distinct items have distinct fspath attributes. # This is necessary because pytest has an internal cache keyed on it, # and only the first test with any given fspath will be run. # # This cannot use super(HTMLItem, self).__init__(..) because only the # Collector constructor takes the fspath argument. pytest.Item.__init__(self, name, parent) pytest.Collector.__init__(self, name, parent, fspath=py.path.local(filename))
def get_courses(args, parameter, value): filename = 'courses-%s-%s.json' % (parameter, value) if args.cached and os.path.exists(filename): with open(filename) as fp: return json.load(fp) url = ( 'http://kursuskatalog.au.dk/coursecatalog/Course/ajaxsearch/' + '?tx_aucoursecatalog_pi1[%s]=%s') % (parameter, value) logger.debug('Retrieve %r', url) r = requests.get(url) logger.debug('%d bytes', len(r.content)) document = html5lib.parse(r.content, encoding=r.encoding) ns = {'h': 'http://www.w3.org/1999/xhtml'} path = ".//h:tbody/h:tr" rows = document.findall(path, ns) courses = [] for row in rows: path = "./h:td" cells = row.findall(path, ns) cellTexts = [' '.join(''.join(cell.itertext()).split()) for cell in cells] courseCell = cells[0] path = './h:a' courseLink = courseCell.find(path, ns) coursePath = courseLink.get('href') course = dict(zip( 'courseName level ects period courseLanguage institut'.split(), cellTexts)) course['link'] = coursePath courses.append(course) with open(filename, 'w') as fp: json.dump(courses, fp, indent=2) return courses
def scrape_registrar(course, url, css_query, max_capacity):
    try:
        r = requests.get(url)
    except:
        print("Connection reset. Retrying.")
        send_email('Connection reset', 'Retrying')
        return
    raw_html = r.text
    page = html5lib.parse(raw_html, treebuilder='lxml',
                          namespaceHTMLElements=False)
    selector = lxml.cssselect.CSSSelector(css_query)
    match = selector(page)
    status = match[0].text
    print('current: %s, capacity: %s' % (status, max_capacity))
    if status != max_capacity:
        subj = '%s is available, sign up now!\n' % course
        body = 'https://be.my.ucla.edu/ClassPlanner/ClassPlan.aspx'
        print(subj)
        send_email(subj, body)
        return True
    else:
        print('%s is still closed :(\n' % course)
        return False
def get_top_result(title, artist): r = requests.get(BASE_URL,params={'q':title}) raw_html = r.text page = html5lib.parse(raw_html, treebuilder='lxml', namespaceHTMLElements=False) # Find result with closest matching artist css_query = '.sen b' selector = lxml.cssselect.CSSSelector(css_query) match = selector(page) top_match = 100 index = 0 count = 0 for m in match: if m.text is not None and m.text.upper() == m.text: lev_dist = levenshtein(m.text.lower(), artist.lower()) if lev_dist < top_match: top_match = lev_dist index = count count+=1 if top_match < 5: css_query = '.sen a' selector = lxml.cssselect.CSSSelector(css_query) match = selector(page) if index < len(match): return ''.join([char for char in match[index].get('href')]) return None
def get_image_urls(self, title, author, log, abort, timeout): from calibre.utils.cleantext import clean_ascii_chars from urllib import urlencode import html5lib import json from collections import OrderedDict ans = OrderedDict() br = self.browser q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')}).decode('utf-8') sz = self.prefs['size'] if sz == 'any': sz = '' elif sz == 'l': sz = 'isz:l,' else: sz = 'isz:lt,islt:%s,' % sz # See https://www.google.com/advanced_image_search to understand this # URL scheme url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz) log('Search URL: ' + url) raw = br.open(url).read().decode('utf-8') root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False) for div in root.xpath('//div[@class="rg_meta"]'): try: data = json.loads(div.text) except Exception: continue if 'ou' in data: ans[data['ou']] = True return list(ans.iterkeys())
def __init__(self, *args, **kwargs):
    self.__url__ = None
    self.__doc__ = None
    self.__parsed__ = {"items": [], "rels": {}}

    if len(args) > 0:
        if type(args[0]) is file:
            # load file
            self.__doc__ = html5lib.parse(args[0], treebuilder="dom")
            if len(args) > 1 and (type(args[1]) is str or type(args[1]) is unicode):
                self.__url__ = args[1]  # TODO: parse this properly
        elif type(args[0]) is str or type(args[0]) is unicode:
            pass  # load URL

    # test for base
    if self.__doc__ is not None and self.__url__ is None:
        poss_bases = self.__doc__.getElementsByTagName("base")
        actual_base = None
        if len(poss_bases) != 0:
            for poss_base in poss_bases:
                if poss_base.hasAttribute("href"):
                    # check to see if absolute
                    if urlparse(poss_base.getAttribute("href")).netloc != '':
                        self.__url__ = poss_base.getAttribute("href")

    if self.__doc__ is not None:
        # parse!
        self.__doc__.documentElement.apply_backcompat_rules()
        self.parse()
def thread(data, default=u"Untitled.", id=None): """ Extract <h1> title from web page. The title is *probably* the text node, which is the nearest H1 node in context to an element with the `isso-thread` id. """ html = html5lib.parse(data, treebuilder="dom") assert html.lastChild.nodeName == "html" html = html.lastChild # aka getElementById, but limited to div and section tags el = list(filter(lambda i: i.attributes["id"].value == "isso-thread", filter(lambda i: "id" in i.attributes, chain(*map(html.getElementsByTagName, ("div", "section")))))) if not el: return id, default el = el[0] visited = [] def recurse(node): for child in node.childNodes: if child.nodeType != child.ELEMENT_NODE: continue if child.nodeName.upper() == "H1": return child if child not in visited: return recurse(child) def gettext(rv): for child in rv.childNodes: if child.nodeType == child.TEXT_NODE: yield child.nodeValue if child.nodeType == child.ELEMENT_NODE: for item in gettext(child): yield item try: id = unquote(el.attributes["data-isso-id"].value) except (KeyError, AttributeError): pass try: return id, unquote(el.attributes["data-title"].value) except (KeyError, AttributeError): pass while el is not None: # el.parentNode is None in the very end visited.append(el) rv = recurse(el) if rv: return id, ''.join(gettext(rv)).strip() el = el.parentNode return id, default
def do_parse_test(html, n):
    start = time.time()
    for i in xrange(n):
        tree = html5lib.parse(html)
        tree.name
    stop = time.time()
    print stop - start, "s"
def generate_public_uti_map(): from lxml import etree import html5lib, urllib raw = urllib.urlopen( 'https://developer.apple.com/library/ios/documentation/Miscellaneous/Reference/UTIRef/Articles/System-DeclaredUniformTypeIdentifiers.html').read() root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) tables = root.xpath('//table')[0::2] data = {} for table in tables: for tr in table.xpath('descendant::tr')[1:]: td = tr.xpath('descendant::td') identifier = etree.tostring(td[0], method='text', encoding=unicode).strip() tags = etree.tostring(td[2], method='text', encoding=unicode).strip() identifier = identifier.split()[0].replace('\u200b', '') exts = [x.strip()[1:].lower() for x in tags.split(',') if x.strip().startswith('.')] for ext in exts: data[ext] = identifier lines = ['PUBLIC_UTI_MAP = {'] for ext in sorted(data): r = ("'" + ext + "':").ljust(16) lines.append((' ' * 4) + r + "'" + data[ext] + "',") lines.append('}') with open(__file__, 'r+b') as f: raw = f.read() f.seek(0) nraw = re.sub(r'^PUBLIC_UTI_MAP = .+?}', '\n'.join(lines), raw, flags=re.MULTILINE | re.DOTALL) f.truncate(), f.write(nraw)
def scrape_category (url, title): category_slug = slugify (title) try: f = urlopen (url) except ValueError: if trace: print 'Retrying:', url url = 'http://eracks.com' + url.replace (' ','%20') if trace: print 'As:', url f = urlopen (url) doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False) # this didn't work, but above three lines did: encoding='utf-8', html.xhtml_to_html (doc) jQuery = PyQuery([doc]) page_title = jQuery ('title').text() if page_title.startswith ("eRacks Open Source Systems: "): page_title = page_title.partition ("eRacks Open Source Systems: ") [-1] if page_title.startswith ("eRacks "): page_title = page_title.partition ("eRacks ") [-1] content = jQuery ('td#content') links = content ('a') images = content ('img') for link in links: a = PyQuery (link) href = a.attr('href') skus = find_sku.findall (href) if skus: sku = skus [0] #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku))) a.attr ('href', '/products/%s/%s/' % (category_slug, sku)) elif href.startswith ('/Legacy'): sku = slugify (href.split ('/') [-1]) #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku))) a.attr ('href', '/products/%s/%s/' % (category_slug, sku)) print 'link:', a.attr('href') for image in images: img = PyQuery (image) src = img.attr('src') newsrc = getimage (src, 'categories/' + category_slug) img.attr ('src', newsrc) print 'image:', newsrc description = content.html() if trace: print description if dbteeth: cat = Categories.objects.get (name=title) cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today()) cat.description = description cat.title = page_title cat.save() print '..saved.'
def chrome_versions(): if is_ci: return [] print('Getting chrome versions...') import html5lib raw = download_securely( 'https://en.wikipedia.org/wiki/Google_Chrome_version_history').decode('utf-8') root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) table = root.xpath('//*[@id="mw-content-text"]//tbody')[-1] ans = [] for tr in table.iterchildren('tr'): cells = tuple(tr.iterchildren('td')) if not cells: continue if not cells[2].text or not cells[2].text.strip(): continue s = cells[0].get('style') if '#a0e75a' not in s and 'salmon' not in s: break chrome_version = cells[0].text.strip() ts = datetime.strptime(cells[1].text.strip().split()[ 0], '%Y-%m-%d').date().strftime('%Y-%m-%d') try: webkit_version = cells[2].text.strip().split()[1] except IndexError: continue ans.append({'date': ts, 'chrome_version': chrome_version, 'webkit_version': webkit_version}) return list(reversed(ans))
def parse_html_to_tree(html):
    '''
    Use the html5lib module to normalize the HTML document and convert it
    from HTML into an XML element DOM tree.
    '''
    # TODO: the DOM tree structure built by html5lib seems unreliable
    root = html5lib.parse(html, treebuilder="etree")
    return root
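# The tree returned above is a standard xml.etree element whose tags carry the
# XHTML namespace by default. A small usage sketch (the sample markup below is
# illustrative only):
def demo_parse_html_to_tree():
    root = parse_html_to_tree('<p>hello <a href="/x">link</a></p>')
    ns = '{http://www.w3.org/1999/xhtml}'
    for a in root.iter(ns + 'a'):
        print(a.get('href'))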
def downloadHTMLFile(url, hasToStartWith):
    print("Download from " + url)
    website = urllib.urlopen(url)
    t = website.read()
    dom = html5lib.parse(t, treebuilder="dom")
    lis = dom.getElementsByTagName("li")
    resultList = []
    for l in lis:
        if len(l.childNodes) > 0:
            ll = l.childNodes[0]
            if len(ll.childNodes) > 0:
                lll = ll.childNodes[0]
                if lll.nodeValue == None:
                    continue
                if lll.nodeValue.startswith("Lists"):
                    break
                nv = lll.nodeValue
                if "(" in lll.nodeValue:
                    nv = lll.nodeValue[:lll.nodeValue.find("(") - 1]
                print("----->" + nv)
                resultList.append(nv)
                print(lll.nodeValue)
    return resultList
def __init__(self, guess=None, filename=None, url=None, file_obj=None, string=None, tree=None, encoding=None, base_url=None, url_fetcher=default_url_fetcher, media_type='print'): result = _select_source(guess, filename, url, file_obj, string, tree, base_url, url_fetcher) with result as (source_type, source, base_url, protocol_encoding): if source_type == 'tree': result = source else: if not encoding: encoding = protocol_encoding if isinstance(source, unicode): encoding = None result = html5lib.parse(source, treebuilder='lxml', encoding=encoding, namespaceHTMLElements=False) assert result base_url = find_base_url(result, base_url) if hasattr(result, 'getroot'): result.docinfo.URL = base_url result = result.getroot() else: result.getroottree().docinfo.URL = base_url self.root_element = result self.base_url = base_url self.url_fetcher = url_fetcher self.media_type = media_type
def test_parse_etree():
    """
    Parsing a fragment to an etree produces a document root element that
    contains the document, including implied tags.
    """
    doc = parse(
        "<!DOCTYPE html><html><title>...</title><p>...</p></html><!-- ... -->",
        treebuilder="etree",
    )
    assert doc.tag == 'DOCUMENT_ROOT'

    [doctype, html, comment] = doc
    assert doctype.tag == "<!DOCTYPE>"
    assert doctype.text == "html"
    assert html.tag == "{http://www.w3.org/1999/xhtml}html"
    assert comment.tag is ElementTree.Comment
    assert comment.text == " ... "

    [head, body] = html
    assert head.tag == "{http://www.w3.org/1999/xhtml}head"
    assert body.tag == "{http://www.w3.org/1999/xhtml}body"

    [title] = head
    assert title.tag == "{http://www.w3.org/1999/xhtml}title"

    [p] = body
    assert p.tag == "{http://www.w3.org/1999/xhtml}p"
    assert p.text == "..."
def step1(self, fonts):
    # self.width = width    # client width
    # self.height = height  # client height
    self.fonts = fonts   # font family
    self.lineNo = 0      # counts every line in the book
    self.words = []      # unique words; parallel with self.widths
    self.widths = []     # each word's width in pixels
    self.index = []      # each word's index into words/widths; one index
                         # for every word in the chapter/book
    self.init()
    with open(DIR + '/index.html', 'r', encoding="utf-8") as f:
        html = f.read()
    document = html5lib.parse(html)
    self.traverse(document)
    if len(self.rects) > 0:
        pageNo = int(self.lineNo / LPP)
        self.lineNo = LPP * (pageNo + 1)
        self.produceFile()
    self.makeChapter()
    self.makeWords()
# -*- coding: utf-8 -*-
import html5lib

print('Parsing by specifying a treebuilder:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Call html5lib.parse directly, building the tree with lxml
content = html5lib.parse(document, treebuilder="lxml", namespaceHTMLElements=False)
# Give the tag path of the content to extract
rows = content.xpath('/html/body/h1')
for row in rows:
    t = row.xpath('./text()')[0]  # once the node is located, pull its content with text()
    print(t)

print('Parsing by specifying tree:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Build an HTMLParser instance that constructs an lxml tree
p = html5lib.HTMLParser(strict=False, tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
# Parse the HTML document
t = p.parse(document)
rows = t.xpath('/html/body/h1')
for row in rows:
    t = row.xpath('./text()')[0]
    print(t)

print('Extracting hyperlinks by specifying tree:')
document = '<html><head><title>Test</title></head><body><a href="www.baidu.com">baidu</body></html>'
# The snippet is truncated here in the source; a likely completion, mirroring
# the parser construction above:
p = html5lib.HTMLParser(strict=False, tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
t = p.parse(document)
for href in t.xpath('/html/body/a/@href'):
    print(href)
def main(): parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('filename', metavar='filename', type=str, nargs='+', help='filename with HTML timetable') args = parser.parse_args() #print(args.filename) hour_list = [] multirow = False multirow_no = 0 with open(args.filename[0], 'rb') as f: doc = html5lib.parse(f) #print(doc) body = doc.find('{http://www.w3.org/1999/xhtml}body') #print(body) print() for e in body: if e.tag == '{http://www.w3.org/1999/xhtml}table': #print(e) #print(e.attrib) cl = e.attrib['class'] if cl == 'grid-border-args': tbody = e.find('{http://www.w3.org/1999/xhtml}tbody') # for each row in the table for day_no, row in enumerate(tbody.findall('{http://www.w3.org/1999/xhtml}tr')): #print(row) # for each time slot for col_no, td in enumerate(row.findall('{http://www.w3.org/1999/xhtml}td')): #print(td.text) #print(sh) if 'class' in td.attrib: cl = td.attrib['class'] if cl == 'col-label-one' or cl == 'row-label-one': #print(td.attrib) if 'rowspan' in td.attrib: if td.attrib['rowspan'] != '1': #print('Anomaly') multirow = True else: multirow = False text = td.text if td.text else ' ' if text in DAYS: max_day_len = max([ len(s) for s in DAYS]) print(td.text + ''.rjust(max_day_len - len(text)) + ': ', end=' ') else: hour_list.append(td.text) print(td.text, end=' ') # actual lecture elif cl == 'object-cell-border': #print('HODINA') hours, lecture = parse_lecture(td) for i in range(hours): #print('day_no= {}, multirow_no= {}'.format(day_no, multirow_no)) sh[day_no - multirow_no].append('X') else: # empty #print('day_no= {}, multirow_no= {}'.format(day_no, multirow_no)) sh[day_no - multirow_no].append('O') else: print(' ') print() if multirow: multirow_no += 1 print() print_schedule(hour_list, sh)
for record in f: # We convert into UTF8 first of all orig_encoding, text = convert_encoding(record.payload.read()) url = record.url if orig_encoding is None: logging.info("Encoding of document " + url + " could not be identified") if len(text) > 0: # HTML is then normalized cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False) tree="" try: cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE)) document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False) tree = etree.tostring(document) except: continue tree = etree.tostring(document) cleantree = tree.decode("utf8").replace(" ", " ") cleantree = cleantree.replace("\t", " ") # lang id lang = guess_lang_from_data2(cleantree) if len(languages) > 0 and lang not in languages: logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.") else: # If enabled, remove boilerplate HTML if options.boilerpipe:
def parse_headlines_s(s):
    root = html5lib.parse(s, default_treebuilder)
    return parse_headlines(root)
import html5lib, sys
from xml.etree import ElementTree as ET

tree = html5lib.parse(sys.stdin.read().decode("utf-8"),
                      namespaceHTMLElements=False)

def filtertext(t):
    if t == None:
        return None
    return t.replace("\n", " ")

# STOP = "script pre code style".split()
PROCESS = "p sub li a em".split()

def process(elt):
    # if elt.tag in STOP:
    #     return
    # print elt.tag
    if elt.tag in PROCESS:
        elt.text = filtertext(elt.text)
        elt.tail = filtertext(elt.tail)
    for child in elt:
        process(child)

process(tree)
sys.stdout.write(ET.tostring(tree).encode("utf-8"))
def cmd_login(user, passwd): """Attempts to log into GOG and saves the resulting cookiejar to disk. """ login_data = { 'user': user, 'passwd': passwd, 'auth_url': None, 'login_token': None, 'two_step_url': None, 'two_step_token': None, 'two_step_security_code': None, 'login_success': False, } global_cookies.clear() # reset cookiejar # prompt for login/password if needed if login_data['user'] is None: login_data['user'] = input("Username: "******"attempting gog login as '{}' ...".format(login_data['user'])) # fetch the auth url with request(GOG_HOME_URL, delay=0) as page: etree = html5lib.parse(page, namespaceHTMLElements=False) for elm in etree.findall('.//script'): if elm.text is not None and 'GalaxyAccounts' in elm.text: login_data['auth_url'] = elm.text.split("'")[3] break # fetch the login token with request(login_data['auth_url'], delay=0) as page: etree = html5lib.parse(page, namespaceHTMLElements=False) # Bail if we find a request for a reCAPTCHA if len(etree.findall( './/div[@class="g-recaptcha form__recaptcha"]')) > 0: error( "cannot continue, gog is asking for a reCAPTCHA :( try again in a few minutes." ) return for elm in etree.findall('.//input'): if elm.attrib['id'] == 'login__token': login_data['login_token'] = elm.attrib['value'] break # perform login and capture two-step token if required with request(GOG_LOGIN_URL, delay=0, args={ 'login[username]': login_data['user'], 'login[password]': login_data['passwd'], 'login[login]': '', 'login[_token]': login_data['login_token'] }) as page: etree = html5lib.parse(page, namespaceHTMLElements=False) if 'two_step' in page.geturl(): login_data['two_step_url'] = page.geturl() for elm in etree.findall('.//input'): if elm.attrib['id'] == 'second_step_authentication__token': login_data['two_step_token'] = elm.attrib['value'] break elif 'on_login_success' in page.geturl(): login_data['login_success'] = True # perform two-step if needed if login_data['two_step_url'] is not None: login_data['two_step_security_code'] = input( "enter two-step security code: ") # Send the security code back to GOG with request(login_data['two_step_url'], delay=0, args={ 'second_step_authentication[token][letter_1]': login_data['two_step_security_code'][0], 'second_step_authentication[token][letter_2]': login_data['two_step_security_code'][1], 'second_step_authentication[token][letter_3]': login_data['two_step_security_code'][2], 'second_step_authentication[token][letter_4]': login_data['two_step_security_code'][3], 'second_step_authentication[send]': "", 'second_step_authentication[_token]': login_data['two_step_token'] }) as page: if 'on_login_success' in page.geturl(): login_data['login_success'] = True # save cookies on success if login_data['login_success']: info('login successful!') global_cookies.save() else: error('login failed, verify your username/password and try again.')
def html5libparse(url, f):
    'html5libparse() - use html5lib to parse anchor tags'
    output(urljoin(url, x.attributes['href'])
           for x in parse(f)
           if isinstance(x, treebuilders.simpletree.Element) and
           x.name == 'a')
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ''' Note this method will retry without identifiers automatically if no match is found with identifiers. ''' from calibre.utils.cleantext import clean_ascii_chars from calibre.ebooks.chardet import xml_to_unicode from lxml.html import tostring import html5lib testing = getattr(self, 'running_a_test', False) query, domain = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if query is None: log.error('Insufficient metadata to construct query') return br = self.browser if testing: print ('Using user agent for amazon: %s'%self.user_agent) try: raw = br.open_novisit(query, timeout=timeout).read().strip() except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: log.error('Query malformed: %r'%query) return attr = getattr(e, 'args', [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): msg = _('Amazon timed out. Try again later.') log.error(msg) else: msg = 'Failed to make identify query: %r'%query log.exception(msg) return as_unicode(msg) raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]) if testing: import tempfile with tempfile.NamedTemporaryFile(prefix='amazon_results_', suffix='.html', delete=False) as f: f.write(raw.encode('utf-8')) print ('Downloaded html for results page saved in', f.name) matches = [] found = '<title>404 - ' not in raw if found: try: root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) except: msg = 'Failed to parse amazon page for query: %r'%query log.exception(msg) return msg errmsg = root.xpath('//*[@id="errorMessage"]') if errmsg: msg = tostring(errmsg, method='text', encoding=unicode).strip() log.error(msg) # The error is almost always a not found error found = False if found: matches = self.parse_results_page(root, domain) if abort.is_set(): return if not matches: if identifiers and title and authors: log('No matches found with identifiers, retrying using only' ' title and authors. Query: %r'%query) return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) log.error('No matches found with query: %r'%query) return workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing) for i, url in enumerate(matches)] for w in workers: w.start() # Don't send all requests at the same time time.sleep(0.1) while not abort.is_set(): a_worker_is_alive = False for w in workers: w.join(0.2) if abort.is_set(): break if w.is_alive(): a_worker_is_alive = True if not a_worker_is_alive: break return None
def parse(path='html5ents.xml'):
    return html5lib.parse(open(path), treebuilder='lxml')
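# With treebuilder='lxml' the call above returns an lxml ElementTree, so the
# usual getroot()/xpath() API applies. A minimal usage sketch (the file name
# is simply the default from the helper above):
def list_ids(path='html5ents.xml'):
    tree = parse(path)
    root = tree.getroot()
    # tags are in the XHTML namespace unless namespaceHTMLElements=False is used
    return [el.get('id') for el in root.xpath('//*[@id]')]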
def test_html5_parser(self):
    from html5_parser import parse
    parse('<p>xxx')
def parse_email_html(html_data): etree_document = html5lib.parse(html_data, treebuilder="lxml", namespaceHTMLElements=False) root = etree_document.getroot() # By inspecting HTML payloads (saved/dumped elsewhere), # (samples taken at points which the scraping threw an exception!), # # It's clear that the format in the emails is close-enough that it's easier # to write a flexible scraper, than to scrape scrictly for slight variations. # # Emails after 2014-Aug (ish) change from: # <td>BookTitle <span>By AuthorName</span></td> # to: # <td><a>$BookTitle</a> <span>By $AuthorName</span></td> # Additionally, emails after 2014-Aug (ish) no longer include # a <td>$DateOfPurchase</td>, so, this changes xpath of $Price <td/> # # Emails after 2015-Aug (ish) change the specific xpath to the items table. # # Edge case in my emails is an email (before 2014-Aug) with no <span/>, # & so no author. Okay. # General formula was # some_xpath = "/path/to/el" # some_el = root.xpath(some_xpath) # n.b. this is a list. # some_str = f(some_el) # some_el[0].text, etc. # TBH, most of the rest is "magic"/hard-coded enough (by nature) # that it's not particularly maintainable. # Scrapers should be fragile. # items_table contains all the <tr/> with order items. # items_table_xpath = "/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[2]/table[3]/tbody/tr/td[2]/table[3]/tbody/tr/td/table[4]" items_table_xpath = "/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[2]/table/tbody/tr/td[2]/table[3]/tbody/tr/td/table[4]" items_table = root.xpath(items_table_xpath)[0] # "/tbody..." vs "tbody..."? item_rows = items_table.xpath("tbody/tr") # print "DEBUG Num item rows: ", len(item_rows) # For individual <tr/>, return { title, author, price } def item_from_row(tr): # Because it's email, the <tr/> has a table or two inside it. Cool. title_author_td = tr.xpath("td/table/tbody/tr/td/table/tbody/tr/td[2]") # print "DEBUG Title Author TD len", len(title_author_td) # How to do things like ".getElementsByTag"? :S # Prefer BeautifulSoup for some things? a = title_author_td[0].xpath("a") if len(a) == 0: title = title_author_td[0].text else: title = a[0].text # print "DEBUG Title", title span = title_author_td[0].xpath("span") if len(span) > 0: # Get rid of the "By.." author = " ".join(span[0].text.split()[1:]) else: author = None # print "DEBUG author ", author # Price <td/> is the last one. price_td = tr.xpath("td/table/tbody/tr/td/table/tbody/tr/td")[-1] price = price_td.text print "DEBUG Kobo found '%s' by '%s' @ '%s'" % (title, author, price) return {"title": title, "author": author, "price": price} return [item_from_row(r) for r in item_rows]
def filterHtml(selectFunc, fd):
    document = html5lib.parse(fd)
    walker = html5lib.getTreeWalker("etree")
    stream = walker(document)
    s = HTMLSerializer()
    yield ''.join(s.serialize(Select(stream, selectFunc)))
def thread(data, default="Untitled.", id=None): """ Extract <h1> title from web page. The title is *probably* the text node, which is the nearest H1 node in context to an element with the `isso-thread` id. """ html = html5lib.parse(data, treebuilder="dom") assert html.lastChild.nodeName == "html" html = html.lastChild # aka getElementById, but limited to div and section tags el = list( filter( lambda i: i.attributes["id"].value == "isso-thread", filter( lambda i: "id" in i.attributes, chain(*map(html.getElementsByTagName, ("div", "section")))))) if not el: return id, default el = el[0] visited = [] def recurse(node): for child in node.childNodes: if child.nodeType != child.ELEMENT_NODE: continue if child.nodeName.upper() == "H1": return child if child not in visited: return recurse(child) def gettext(rv): for child in rv.childNodes: if child.nodeType == child.TEXT_NODE: yield child.nodeValue if child.nodeType == child.ELEMENT_NODE: for item in gettext(child): yield item try: id = unquote(el.attributes["data-isso-id"].value) except (KeyError, AttributeError): pass try: return id, unquote(el.attributes["data-title"].value) except (KeyError, AttributeError): pass while el is not None: # el.parentNode is None in the very end visited.append(el) rv = recurse(el) if rv: return id, ''.join(gettext(rv)).strip() el = el.parentNode return id, default
def parse(doc):
    return html5lib.parse(doc, namespaceHTMLElements=False)
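# With namespaceHTMLElements=False the resulting etree uses plain tag names,
# so searches don't need the {http://www.w3.org/1999/xhtml} prefix. A quick
# sketch using the helper above:
def first_link(doc):
    root = parse(doc)
    a = root.find('.//a')
    return None if a is None else a.get('href')

# first_link('<p><a href="/home">home</a></p>') -> '/home'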
def parse(path="html5ents.xml"): return html5lib.parse(open(path), treebuilder="lxml")
from collections import OrderedDict # This script scrapes data from BioPKU's PAHvdb. # There are 2 cookies for the page. This one is the one that is required for the script to work. cookies = {'BIOPKUCopyrightDisclaimer': '1'} # This has to be a post request. Can't do requests.get(URL). r = requests.post( 'http://biopku.org/pah/search-results-browse.asp', data={'searchType': '2'}, # This represents clicking search and browse to get to the list of all variants in the database cookies=cookies ) doc = html5lib.parse( r.text, #input HTML treebuilder='lxml', #enable xpath function namespaceHTMLElements=False #disable namespace prefixes ) links = doc.xpath('.//div[@id="container-body-wide"]//td[position()=4]//a') #links = ['result-details-pah.asp?ID=689', 'result-details-pah.asp?ID=623','result-details-pah.asp?ID=622'] #for testing purposes #links = ['result-details-pah.asp?ID=692', 'result-details-pah.asp?ID=693', 'result-details-pah.asp?ID=694', 'result-details-pah.asp?ID=733'] # Used https://pythex.org/ to check my regex link_match = re.compile("/centralstore/pah/[a-zA-Z\.\d_\-()+]*_PAH.htm[l]?") for link in links: r = requests.get('http://biopku.org/pah/' + link.attrib['href'], cookies=cookies) #r = requests.get('http://biopku.org/pah/' + link, cookies=cookies) doc = html5lib.parse(r.text, treebuilder='lxml', namespaceHTMLElements=False) rows = doc.xpath('.//div[@id="right-body"]//td') #print rows #this is blank [] for empty pages
class SourceFile(object): parsers = { "html": lambda x: html5lib.parse(x, treebuilder="etree"), "xhtml": lambda x: ElementTree.parse(x, XMLParser.XMLParser()), "svg": lambda x: ElementTree.parse(x, XMLParser.XMLParser()) } root_dir_non_test = set(["common"]) dir_non_test = set(["resources", "support", "tools"]) dir_path_non_test = {("css21", "archive"), ("css", "CSS2", "archive"), ("css", "common")} def __init__(self, tests_root, rel_path, url_base, contents=None): """Object representing a file in a source tree. :param tests_root: Path to the root of the source tree :param rel_path: File path relative to tests_root :param url_base: Base URL used when converting file paths to urls :param contents: Byte array of the contents of the file or ``None``. """ self.tests_root = tests_root if os.name == "nt": # do slash normalization on Windows if isinstance(rel_path, binary_type): self.rel_path = rel_path.replace(b"/", b"\\") else: self.rel_path = rel_path.replace(u"/", u"\\") else: self.rel_path = rel_path self.url_base = url_base self.contents = contents self.dir_path, self.filename = os.path.split(self.rel_path) self.name, self.ext = os.path.splitext(self.filename) self.type_flag = None if "-" in self.name: self.type_flag = self.name.rsplit("-", 1)[1].split(".")[0] self.meta_flags = self.name.split(".")[1:] self.items_cache = None def __getstate__(self): # Remove computed properties if we pickle this class rv = self.__dict__.copy() if "__cached_properties__" in rv: cached_properties = rv["__cached_properties__"] for key in rv.keys(): if key in cached_properties: del rv[key] del rv["__cached_properties__"] return rv def name_prefix(self, prefix): """Check if the filename starts with a given prefix :param prefix: The prefix to check""" return self.name.startswith(prefix) def is_dir(self): """Return whether this file represents a directory.""" if self.contents is not None: return False return os.path.isdir(self.rel_path) def open(self): """ Return either * the contents specified in the constructor, if any; * a File object opened for reading the file contents. 
""" if self.contents is not None: file_obj = ContextManagerBytesIO(self.contents) else: file_obj = open(self.path, 'rb') return file_obj @cached_property def path(self): return os.path.join(self.tests_root, self.rel_path) @cached_property def url(self): return rel_path_to_url(self.rel_path, self.url_base) @cached_property def hash(self): with self.open() as f: return hashlib.sha1(f.read()).hexdigest() def in_non_test_dir(self): if self.dir_path == "": return True parts = self.dir_path.split(os.path.sep) if (parts[0] in self.root_dir_non_test or any(item in self.dir_non_test for item in parts) or any(parts[:len(path)] == list(path) for path in self.dir_path_non_test)): return True return False def in_conformance_checker_dir(self): return (self.dir_path == "conformance-checkers" or self.dir_path.startswith("conformance-checkers" + os.path.sep)) @property def name_is_non_test(self): """Check if the file name matches the conditions for the file to be a non-test file""" return (self.is_dir() or self.name_prefix("MANIFEST") or self.filename.startswith(".") or self.type_flag == "support" or self.in_non_test_dir()) @property def name_is_conformance(self): return (self.in_conformance_checker_dir() and self.type_flag in ("is-valid", "no-valid")) @property def name_is_conformance_support(self): return self.in_conformance_checker_dir() @property def name_is_stub(self): """Check if the file name matches the conditions for the file to be a stub file""" return self.name_prefix("stub-") @property def name_is_manual(self): """Check if the file name matches the conditions for the file to be a manual test file""" return self.type_flag == "manual" @property def name_is_visual(self): """Check if the file name matches the conditions for the file to be a visual test file""" return self.type_flag == "visual" @property def name_is_multi_global(self): """Check if the file name matches the conditions for the file to be a multi-global js test file""" return "any" in self.meta_flags and self.ext == ".js" @property def name_is_worker(self): """Check if the file name matches the conditions for the file to be a worker js test file""" return "worker" in self.meta_flags and self.ext == ".js" @property def name_is_window(self): """Check if the file name matches the conditions for the file to be a window js test file""" return "window" in self.meta_flags and self.ext == ".js" @property def name_is_webdriver(self): """Check if the file name matches the conditions for the file to be a webdriver spec test file""" # wdspec tests are in subdirectories of /webdriver excluding __init__.py # files. 
rel_dir_tree = self.rel_path.split(os.path.sep) return (((rel_dir_tree[0] == "webdriver" and len(rel_dir_tree) > 1) or (rel_dir_tree[:2] == ["infrastructure", "webdriver"] and len(rel_dir_tree) > 2)) and self.filename not in ("__init__.py", "conftest.py") and fnmatch(self.filename, wd_pattern)) @property def name_is_reference(self): """Check if the file name matches the conditions for the file to be a reference file (not a reftest)""" return "/reference/" in self.url or "/reftest/" in self.url or bool( reference_file_re.search(self.name)) @property def markup_type(self): """Return the type of markup contained in a file, based on its extension, or None if it doesn't contain markup""" ext = self.ext if not ext: return None if ext[0] == ".": ext = ext[1:] if ext in ["html", "htm"]: return "html" if ext in ["xhtml", "xht", "xml"]: return "xhtml" if ext == "svg": return "svg" return None @cached_property def root(self): """Return an ElementTree Element for the root node of the file if it contains markup, or None if it does not""" if not self.markup_type: return None parser = self.parsers[self.markup_type] with self.open() as f: try: tree = parser(f) except Exception: return None if hasattr(tree, "getroot"): root = tree.getroot() else: root = tree return root @cached_property def timeout_nodes(self): """List of ElementTree Elements corresponding to nodes in a test that specify timeouts""" return self.root.findall( ".//{http://www.w3.org/1999/xhtml}meta[@name='timeout']") @cached_property def script_metadata(self): if self.name_is_worker or self.name_is_multi_global or self.name_is_window: regexp = js_meta_re elif self.name_is_webdriver: regexp = python_meta_re else: return None with self.open() as f: return list(read_script_metadata(f, regexp)) @cached_property def timeout(self): """The timeout of a test or reference file. 
"long" if the file has an extended timeout or None otherwise""" if self.script_metadata: if any(m == (b"timeout", b"long") for m in self.script_metadata): return "long" if self.root is None: return None if self.timeout_nodes: timeout_str = self.timeout_nodes[0].attrib.get("content", None) if timeout_str and timeout_str.lower() == "long": return "long" return None @cached_property def viewport_nodes(self): """List of ElementTree Elements corresponding to nodes in a test that specify viewport sizes""" return self.root.findall( ".//{http://www.w3.org/1999/xhtml}meta[@name='viewport-size']") @cached_property def viewport_size(self): """The viewport size of a test or reference file""" if self.root is None: return None if not self.viewport_nodes: return None return self.viewport_nodes[0].attrib.get("content", None) @cached_property def dpi_nodes(self): """List of ElementTree Elements corresponding to nodes in a test that specify device pixel ratios""" return self.root.findall( ".//{http://www.w3.org/1999/xhtml}meta[@name='device-pixel-ratio']" ) @cached_property def dpi(self): """The device pixel ratio of a test or reference file""" if self.root is None: return None if not self.dpi_nodes: return None return self.dpi_nodes[0].attrib.get("content", None) @cached_property def testharness_nodes(self): """List of ElementTree Elements corresponding to nodes representing a testharness.js script""" return self.root.findall( ".//{http://www.w3.org/1999/xhtml}script[@src='/resources/testharness.js']" ) @cached_property def content_is_testharness(self): """Boolean indicating whether the file content represents a testharness.js test""" if self.root is None: return None return bool(self.testharness_nodes) @cached_property def variant_nodes(self): """List of ElementTree Elements corresponding to nodes representing a test variant""" return self.root.findall( ".//{http://www.w3.org/1999/xhtml}meta[@name='variant']") @cached_property def test_variants(self): rv = [] if self.ext == ".js": for (key, value) in self.script_metadata: if key == b"variant": rv.append(value.decode("utf-8")) else: for element in self.variant_nodes: if "content" in element.attrib: variant = element.attrib["content"] rv.append(variant) for variant in rv: assert variant == "" or variant[0] in ["#", "?"], variant if not rv: rv = [""] return rv @cached_property def testdriver_nodes(self): """List of ElementTree Elements corresponding to nodes representing a testdriver.js script""" return self.root.findall( ".//{http://www.w3.org/1999/xhtml}script[@src='/resources/testdriver.js']" ) @cached_property def has_testdriver(self): """Boolean indicating whether the file content represents a testharness.js test""" if self.root is None: return None return bool(self.testdriver_nodes) @cached_property def reftest_nodes(self): """List of ElementTree Elements corresponding to nodes representing a to a reftest <link>""" if self.root is None: return [] match_links = self.root.findall( ".//{http://www.w3.org/1999/xhtml}link[@rel='match']") mismatch_links = self.root.findall( ".//{http://www.w3.org/1999/xhtml}link[@rel='mismatch']") return match_links + mismatch_links @cached_property def references(self): """List of (ref_url, relation) tuples for any reftest references specified in the file""" rv = [] rel_map = {"match": "==", "mismatch": "!="} for item in self.reftest_nodes: if "href" in item.attrib: ref_url = urljoin(self.url, item.attrib["href"].strip(space_chars)) ref_type = rel_map[item.attrib["rel"]] rv.append((ref_url, ref_type)) return rv 
@cached_property def content_is_ref_node(self): """Boolean indicating whether the file is a non-leaf node in a reftest graph (i.e. if it contains any <link rel=[mis]match>""" return bool(self.references) @cached_property def css_flag_nodes(self): """List of ElementTree Elements corresponding to nodes representing a flag <meta>""" if self.root is None: return [] return self.root.findall( ".//{http://www.w3.org/1999/xhtml}meta[@name='flags']") @cached_property def css_flags(self): """Set of flags specified in the file""" rv = set() for item in self.css_flag_nodes: if "content" in item.attrib: for flag in item.attrib["content"].split(): rv.add(flag) return rv @cached_property def content_is_css_manual(self): """Boolean indicating whether the file content represents a CSS WG-style manual test""" if self.root is None: return None # return True if the intersection between the two sets is non-empty return bool( self.css_flags & { "animated", "font", "history", "interact", "paged", "speech", "userstyle" }) @cached_property def spec_link_nodes(self): """List of ElementTree Elements corresponding to nodes representing a <link rel=help>, used to point to specs""" if self.root is None: return [] return self.root.findall( ".//{http://www.w3.org/1999/xhtml}link[@rel='help']") @cached_property def spec_links(self): """Set of spec links specified in the file""" rv = set() for item in self.spec_link_nodes: if "href" in item.attrib: rv.add(item.attrib["href"].strip(space_chars)) return rv @cached_property def content_is_css_visual(self): """Boolean indicating whether the file content represents a CSS WG-style visual test""" if self.root is None: return None return bool( self.ext in {'.xht', '.html', '.xhtml', '.htm', '.xml', '.svg'} and self.spec_links) @property def type(self): rv, _ = self.manifest_items() return rv def manifest_items(self): """List of manifest items corresponding to the file. 
There is typically one per test, but in the case of reftests a node may have corresponding manifest items without being a test itself.""" if self.items_cache: return self.items_cache if self.name_is_non_test: rv = "support", [SupportFile(self)] elif self.name_is_stub: rv = Stub.item_type, [Stub(self, self.url)] elif self.name_is_manual: rv = ManualTest.item_type, [ManualTest(self, self.url)] elif self.name_is_conformance: rv = ConformanceCheckerTest.item_type, [ ConformanceCheckerTest(self, self.url) ] elif self.name_is_conformance_support: rv = "support", [SupportFile(self)] elif self.name_is_visual: rv = VisualTest.item_type, [VisualTest(self, self.url)] elif self.name_is_multi_global: globals = b"" for (key, value) in self.script_metadata: if key == b"global": globals = value break tests = [ TestharnessTest(self, global_variant_url(self.url, suffix) + variant, timeout=self.timeout) for suffix in sorted(global_suffixes(globals)) for variant in self.test_variants ] rv = TestharnessTest.item_type, tests elif self.name_is_worker: test_url = replace_end(self.url, ".worker.js", ".worker.html") tests = [ TestharnessTest(self, test_url + variant, timeout=self.timeout) for variant in self.test_variants ] rv = TestharnessTest.item_type, tests elif self.name_is_window: test_url = replace_end(self.url, ".window.js", ".window.html") tests = [ TestharnessTest(self, test_url + variant, timeout=self.timeout) for variant in self.test_variants ] rv = TestharnessTest.item_type, tests elif self.name_is_webdriver: rv = WebdriverSpecTest.item_type, [ WebdriverSpecTest(self, self.url, timeout=self.timeout) ] elif self.content_is_css_manual and not self.name_is_reference: rv = ManualTest.item_type, [ManualTest(self, self.url)] elif self.content_is_testharness: rv = TestharnessTest.item_type, [] testdriver = self.has_testdriver for variant in self.test_variants: url = self.url + variant rv[1].append( TestharnessTest(self, url, timeout=self.timeout, testdriver=testdriver)) elif self.content_is_ref_node: rv = (RefTestNode.item_type, [ RefTestNode(self, self.url, self.references, timeout=self.timeout, viewport_size=self.viewport_size, dpi=self.dpi) ]) elif self.content_is_css_visual and not self.name_is_reference: rv = VisualTest.item_type, [VisualTest(self, self.url)] else: rv = "support", [SupportFile(self)] self.items_cache = rv return rv
def __init__(self, url: str, content: str) -> None:
    super().__init__(url=url)
    self._parsed = html5lib.parse(content, namespaceHTMLElements=False)
def parseDocument(text):
    doc = html5lib.parse(text, treebuilder='lxml',
                         namespaceHTMLElements=False)
    return doc
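A minimal usage sketch for parseDocument (assuming lxml is installed; the markup and variable names below are illustrative, not from the original project). Because namespaceHTMLElements=False is passed, the lxml tree uses plain HTML tag names, so lookups need no XHTML namespace prefix.

# Usage sketch; the input markup is a placeholder.
doc = parseDocument(u'<p id="greeting">Hello</p>')
root = doc.getroot()       # the lxml treebuilder returns an ElementTree
para = root.find('.//p')   # plain tag name thanks to namespaceHTMLElements=False
print(para.get('id'), para.text)  # -> greeting Hello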
def html5_parse(data, max_nesting_depth=100):
    import html5lib
    # html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
    data = re.sub(r'<\s*title\s*[^>]*/\s*>', '<title></title>', data)
    data = html5lib.parse(data, treebuilder='lxml').getroot()

    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('html5lib resulted in a tree with nesting'
                                 ' depth > %d' % max_nesting_depth)

    # Set lang correctly
    xl = data.attrib.pop('xmlU0003Alang', None)
    if xl is not None and 'lang' not in data.attrib:
        data.attrib['lang'] = xl

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
    xmlns_declaration = '{%s}' % XMLNS_NS
    non_html5_namespaces = {}
    seen_namespaces = set()
    for elem in tuple(data.iter(tag=etree.Element)):
        elem.attrib.pop('xmlns', None)
        namespaces = {}
        for x in tuple(elem.attrib):
            if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
                # A namespace declaration
                val = elem.attrib.pop(x)
                if x.startswith('xmlnsU0003A'):
                    prefix = x[11:]
                    namespaces[prefix] = val
        if namespaces:
            # Some destroyed namespace declarations were found
            p = elem.getparent()
            if p is None:
                # We handle the root node later
                non_html5_namespaces = namespaces
            else:
                idx = p.index(elem)
                p.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                p.insert(idx, elem)
        b = barename(elem.tag)
        idx = b.find('U0003A')
        if idx > -1:
            prefix, tag = b[:idx], b[idx + 6:]
            ns = elem.nsmap.get(prefix, None)
            if ns is None:
                ns = non_html5_namespaces.get(prefix, None)
            if ns is not None:
                elem.tag = '{%s}%s' % (ns, tag)
        for b in tuple(elem.attrib):
            idx = b.find('U0003A')
            if idx > -1:
                prefix, tag = b[:idx], b[idx + 6:]
                ns = elem.nsmap.get(prefix, None)
                if ns is None:
                    ns = non_html5_namespaces.get(prefix, None)
                if ns is not None:
                    elem.attrib['{%s}%s' % (ns, tag)] = elem.attrib.pop(b)
        seen_namespaces |= set(elem.nsmap.itervalues())

    nsmap = dict(html5lib.constants.namespaces)
    nsmap[None] = nsmap.pop('html')
    non_html5_namespaces.update(nsmap)
    nsmap = non_html5_namespaces

    data = clone_element(data, nsmap=nsmap, in_context=False)

    # Remove unused namespace declarations
    fnsmap = {k: v for k, v in nsmap.iteritems()
              if v in seen_namespaces and v != XMLNS_NS}
    return clone_element(data, nsmap=fnsmap, in_context=False)
def __init__(self, content, url, headers=None, trusted=None):
    self.content = content
    # Parse once up front; namespaceHTMLElements=False keeps tag names plain.
    self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
    self.url = url
    self.headers = headers
    self.trusted = trusted
def __init__(self, parent, filename, test_type):
    self.url = parent.session.config.server.url(filename)
    self.type = test_type
    self.variants = []
    # Some tests are reliant on the WPT servers' substitution functionality,
    # so tests must be retrieved from the server rather than read from the
    # file system directly.
    handle = urllib.request.urlopen(
        self.url, context=parent.session.config.ssl_context)
    try:
        markup = handle.read()
    finally:
        handle.close()

    if test_type not in TEST_TYPES:
        raise ValueError('Unrecognized test type: "%s"' % test_type)

    parsed = html5lib.parse(markup, namespaceHTMLElements=False)
    name = None
    includes_variants_script = False
    self.expected = None

    for element in parsed.iter():
        if not name and element.tag == 'title':
            name = element.text
            continue
        if element.tag == 'meta' and element.attrib.get('name') == 'variant':
            self.variants.append(element.attrib.get('content'))
            continue
        if element.tag == 'script':
            if element.attrib.get('id') == 'expected':
                try:
                    self.expected = json.loads(text_type(element.text))
                except ValueError:
                    print("Failed parsing JSON in %s" % filename)
                    raise

            src = element.attrib.get('src', '')
            if 'variants.js' in src:
                includes_variants_script = True
                if not resolve_uri(filename, src):
                    raise ValueError(
                        'Could not resolve path "%s" from %s'
                        % (src, filename))

    if not name:
        raise ValueError(
            'No name found in %s; add a <title> element' % filename)
    elif self.type == 'functional':
        if not self.expected:
            raise ValueError(
                'Functional tests must specify expected report data')
        if not includes_variants_script:
            raise ValueError(
                'No variants script found in file %s; add '
                '\'<script src="../../variants.js"></script>\'' % filename)
        if len(self.variants) == 0:
            self.variants = DEFAULT_VARIANTS
    elif self.type == 'unit' and self.expected:
        raise ValueError('Unit tests must not specify expected report data')

    # Ensure that distinct items have distinct fspath attributes.
    # This is necessary because pytest has an internal cache keyed on it,
    # and only the first test with any given fspath will be run.
    #
    # This cannot use super(HTMLItem, self).__init__(..) because only the
    # Collector constructor takes the fspath argument.
    pytest.Item.__init__(self, name, parent)
    pytest.Collector.__init__(self, name, parent,
                              fspath=py.path.local(filename))
def htmlparse(t, encoding=None):
    return html5lib.parse(t, treebuilder='lxml',
                          namespaceHTMLElements=False)
def parse_html(html: str) -> ET.Element:
    '''Parse HTML and return an element tree with the XHTML namespace removed.'''
    doc = html5lib.parse(html)
    doc = remove_namespace(doc)
    return doc
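remove_namespace is a helper defined elsewhere in that project. As a rough, hypothetical sketch of what such a helper typically does (not the project's actual implementation), it rewrites every qualified tag such as '{http://www.w3.org/1999/xhtml}a' to its bare name so plain find()/findall() queries work:

import xml.etree.ElementTree as ET

def strip_namespace(root: ET.Element) -> ET.Element:
    # Hypothetical helper: drop the '{...}' namespace prefix from every tag.
    for el in root.iter():
        if isinstance(el.tag, str) and el.tag.startswith('{'):
            el.tag = el.tag.split('}', 1)[1]
    return root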
def _parse_html(f):
    # type: (BinaryIO) -> ElementTree.ElementTree
    doc = html5lib.parse(f, treebuilder="etree", useChardet=False)
    if MYPY:
        return cast(ElementTree.ElementTree, doc)
    return doc
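A hedged usage sketch for _parse_html (the file name is a placeholder). Unlike the snippets above that pass namespaceHTMLElements=False, the default etree treebuilder keeps elements in the XHTML namespace, so lookups must use namespace-qualified tag names.

XHTML_NS = '{http://www.w3.org/1999/xhtml}'

with open('page.html', 'rb') as f:  # binary mode matches the BinaryIO hint
    doc = _parse_html(f)
for anchor in doc.iter(XHTML_NS + 'a'):
    print(anchor.get('href'))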
def get_player_stats(team_url, season, league_name, results_array,
                     goalie_results_array):
    if results_array is None:
        results_array = []
    if len(results_array) == 0:
        results_array.append(['Name', 'Position', 'Season', 'League', 'Team',
                              'GP', 'G', 'A', 'TP', 'PIM', '+/-', 'ID'])
    if goalie_results_array is None:
        goalie_results_array = []
    if len(goalie_results_array) == 0:
        goalie_results_array.append(
            ['Name', 'Season', 'League', 'Team', 'GP', 'GAA', 'SV%', 'ID'])

    team_search_request = requests.get(team_url + '?tab=stats#players')
    team_page = html5lib.parse(team_search_request.text)
    team_name = team_page.find(
        './/*[@id="name-and-logo"]/{0}h1'.format(
            helpers.html_prefix)).text.strip()

    player_table = team_page.find(
        './/*[@id="players"]/{0}div[1]/{0}div[4]/{0}table'.format(
            helpers.html_prefix))
    goalies_table = team_page.find(
        './/*[@id="players"]/{0}div[2]/{0}div[2]/{0}table'.format(
            helpers.html_prefix))

    players_grouped = helpers.get_ep_table_rows(player_table)
    goalies_grouped = helpers.get_ep_table_rows(goalies_table)

    for group in players_grouped:
        for player in group:
            player_stats = player.findall(
                './/{}td'.format(helpers.html_prefix))
            name_link = player_stats[NAME].find(
                './{0}span/{0}a'.format(helpers.html_prefix))
            name, position = helpers.get_info_from_player_name(name_link.text)
            id = helpers.get_player_id_from_url(name_link.attrib['href'])
            games = player_stats[GAMES].text.strip()
            goals = player_stats[GOALS].text.strip()
            assists = player_stats[ASSISTS].text.strip()
            points = player_stats[POINTS].text.strip()
            pim = player_stats[PIM].text.strip()
            plusminus = player_stats[PLUSMINUS].text.strip()
            results_array.append([
                name, position, season, league_name, team_name, games, goals,
                assists, points, pim, plusminus, id,
            ])

    for goalie_group in goalies_grouped:
        for goalie in goalie_group:
            goalie_stats = goalie.findall('./{}td'.format(helpers.html_prefix))
            name_link = goalie_stats[GOALIE_NAME].find(
                './{0}a'.format(helpers.html_prefix))
            name = name_link.text.strip()
            id = helpers.get_player_id_from_url(name_link.attrib['href'])
            games = goalie_stats[GOALIE_GP].text.strip()
            gaa = goalie_stats[GOALIE_GAA].text.strip()
            svp = goalie_stats[GOALIE_SVP].text.strip()
            goalie_results_array.append([
                name, season, league_name, team_name, games, gaa, svp, id,
            ])
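An illustrative invocation of get_player_stats (the URL, season, and league values are placeholders, and the live site layout may differ from what the scraper expects). The function appends a header row to each empty list and then fills both lists in place rather than returning anything.

skater_rows = []
goalie_rows = []
get_player_stats('https://www.eliteprospects.com/team/0/placeholder-team',
                 '2018-19', 'Example League', skater_rows, goalie_rows)
print(len(skater_rows) - 1, 'skaters;', len(goalie_rows) - 1, 'goalies')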