Example #1
 def __init__(self, guess=None, filename=None, url=None, file_obj=None,
              string=None, tree=None, encoding=None, base_url=None,
              url_fetcher=default_url_fetcher, media_type='print'):
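     # Exactly one of guess/filename/url/file_obj/string/tree is expected;
     # _select_source normalizes whichever source was given.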
     result = _select_source(
         guess, filename, url, file_obj, string, tree, base_url,
         url_fetcher)
     with result as (source_type, source, base_url, protocol_encoding):
         if source_type == 'tree':
             result = source
         else:
             if isinstance(source, unicode):
                 result = html5lib.parse(
                     source, treebuilder='lxml',
                     namespaceHTMLElements=False)
             else:
                 result = html5lib.parse(
                     source, treebuilder='lxml', override_encoding=encoding,
                     transport_encoding=protocol_encoding,
                     namespaceHTMLElements=False)
             assert result
     base_url = find_base_url(result, base_url)
     if hasattr(result, 'getroot'):
         result.docinfo.URL = base_url
         result = result.getroot()
     else:
         result.getroottree().docinfo.URL = base_url
     self.root_element = result
     self.base_url = base_url
     self.url_fetcher = url_fetcher
     self.media_type = media_type
Example #2
def iter_datatable(session, url, **kwargs):
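    # Page through a datatable: yield the header keys once, then every row
    # from every page, and finally the last response object.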
    url += '&numResults=1000&startIndex=0'
    l = blackboard.slowlog()
    response = session.get(url)
    if kwargs.pop('edit_mode', False):
        response = session.ensure_edit_mode(response)
    l("Fetching datatable page 1 took %.4f s")
    history = list(response.history) + [response]
    document = html5lib.parse(response.content, encoding=response.encoding)
    keys, rows = parse_datatable(response, document, **kwargs)
    yield keys
    yield from rows
    next_id = 'listContainer_nextpage_top'
    next_o = document.find('.//h:a[@id="%s"]' % next_id, NS)
    page_number = 1
    while next_o is not None:
        page_number += 1
        url = urljoin(response.url, next_o.get('href'))
        l = blackboard.slowlog()
        response = session.get(url)
        l("Fetching datatable page %d took %.4f s", page_number)
        history += list(response.history) + [response]
        document = html5lib.parse(response.content, encoding=response.encoding)
        keys_, rows = parse_datatable(response, document, **kwargs)
        if keys != keys_:
            raise ValueError(
                "Page %d keys (%r) do not match page 1 keys (%r)" %
                (page_number, keys_, keys))
        next_o = document.find('.//h:a[@id="%s"]' % next_id, NS)
        yield from rows
    response.history = history[:-1]
    yield response
Example #3
def test_index_html(request_factory):
    with request_factory(matched_route='home', map=None) as req:
        for rsc in _RESOURCES:
            if not hasattr(rsc.model, 'first') or not rsc.with_index:
                continue
            dt = req.get_datatable(rsc.name + 's', rsc.model)
            res = render('%s/index_html.mako' % rsc.name, {'ctx': dt}, request=req)
            html5lib.parse(res)
Example #4
 def test_detail_html(self):
     self.set_request_properties(matched_route=Route(), map=None)
     for rsc in _RESOURCES:
         if not hasattr(rsc.model, 'first'):
             continue
         res = render(
             '%s/detail_html.mako' % rsc.name,
             dict(ctx=rsc.model.first()),
             request=self.env['request'])
         html5lib.parse(res)
Example #5
 def test_index_html(self):
     self.set_request_properties(matched_route=Route(), map=None)
     for rsc in _RESOURCES:
         if not hasattr(rsc.model, 'first') or not rsc.with_index:
             continue
         dt = self.env['request'].get_datatable(rsc.name + 's', rsc.model)
         res = render(
             '%s/index_html.mako' % rsc.name,
             dict(ctx=dt),
             request=self.env['request'])
         html5lib.parse(res)
Example #6
def test_detail_html(request_factory):
    with request_factory(matched_route='home', map=None) as req:
        for rsc in _RESOURCES:
            if not hasattr(rsc.model, 'first'):
                continue
            res = render(
                '%s/detail_html.mako' % rsc.name, {'ctx': rsc.model.first()}, request=req)
            html5lib.parse(res)
            if rsc.name == 'dataset':
                assert 'http://example.org/privacy' in res
                assert 'Privacy Policy' in res
Example #7
def huge_bench_html5lib_etree(files):
    etree_doc = html5lib.parse(files['template.html'])
    assert etree_doc.tag == '{http://www.w3.org/1999/xhtml}html'
    print('  template done;')

    etree_doc2 = html5lib.parse(files['spec.html'])
    assert etree_doc2.tag == '{http://www.w3.org/1999/xhtml}html'
    print('  spec done;')

    etree_doc3 = html5lib.parse(files['py33_py34.html'])
    assert etree_doc3.tag == '{http://www.w3.org/1999/xhtml}html'
    print('  py33_py34 done.')
Example #8
def test_html5lib(count, spec_data):
    # No warm-up runs for this benchmark; in real life, the parser doesn't get
    # to warm up (this isn't a daemon process).

    times = []
    for _ in range(count):
        spec_data.seek(0)
        t0 = time.time()
        html5lib.parse(spec_data)
        t1 = time.time()
        times.append(t1 - t0)
    return times
Example #9
def huge_bench_html5lib_dom(files):
    print('html5lib dom:')
    dom_doc = html5lib.parse(files['template.html'], treebuilder="dom")
    assert len(dom_doc.toxml()) > 1024
    print('  template done;')

    dom_doc2 = html5lib.parse(files['spec.html'], treebuilder="dom")
    assert len(dom_doc2.toxml()) > 1024
    print('  spec done;')

    dom_doc3 = html5lib.parse(files['py33_py34.html'], treebuilder="dom")
    assert len(dom_doc3.toxml()) > 1024
    print('  py33_py34 done.')
Example #10
def huge_bench_html5lib_lxml(files):
    print('html5lib lxml:')
    lxml_doc = html5lib.parse(files['template.html'], treebuilder="lxml")
    assert lxml_doc.getroot().tag == '{http://www.w3.org/1999/xhtml}html'
    print('  template done;')

    lxml_doc2 = html5lib.parse(files['spec.html'], treebuilder="lxml")
    assert lxml_doc2.getroot().tag == '{http://www.w3.org/1999/xhtml}html'
    print('  spec done;')

    lxml_doc3 = html5lib.parse(files['py33_py34.html'], treebuilder="lxml")
    assert lxml_doc3.getroot().tag == '{http://www.w3.org/1999/xhtml}html'
    print('  py33_py34 done.')
Example #11
def fetch_data(download_dir):
    # Fetch the index page to get a CSRF token.
    r = requests.get('https://www.runtastic.com/')
    if r.status_code != 200:
        raise Exception('Sucks')
    cookies = dict(r.cookies)
    doc = html5lib.parse(r.text, treebuilder='dom')
    csrf = get_csrf_token(doc)
    # Now log in.
    user, pw = read_user_pass()
    login = dict(csrf)
    login['user[email]'] = user
    login['user[password]'] = pw
    r2 = requests.post('https://www.runtastic.com/en/d/users/sign_in.json', data=login, cookies=cookies)
    if r2.status_code != 200:
        raise Exception('Sucks 2')
    cookies.update(r2.cookies)
    j = r2.json()
    if not j['success']:
        raise Exception('Login failed')
    doc = html5lib.parse(j['update'], treebuilder='dom')
    # Find the sport-sessions page and fetch it to get a User ID
    # and a list of session IDs.
    links = [l.getAttribute('href') for l in doc.getElementsByTagName('a') if l.getAttribute('href').endswith('/sport-sessions')]
    sessions_url = urlparse.urljoin(r2.url, links[0])
    r3 = requests.get(sessions_url, cookies=cookies)
    if r3.status_code != 200:
        raise Exception('Sucks 3')
    cookies.update(r3.cookies)
    doc = html5lib.parse(r3.text, treebuilder='dom')
    uid = get_user_id(doc)
    data = get_data(doc)
    # Now hit the API to get data about each session.
    request_data = dict(csrf)
    request_data['user_id'] = uid
    request_data['items'] = ','.join(str(d[0]) for d in data)
    r4 = requests.post('https://www.runtastic.com/api/run_sessions/json',
                       cookies=cookies,
                       data=request_data)
    if r4.status_code != 200:
        raise Exception('Sucks 4')
    cookies.update(r4.cookies)
    sessions = r4.json()
    known_sessions = read_known_sessions()
    for s in sessions:
        if s['id'] in known_sessions:
            continue
        if check_download_session(urlparse.urljoin(r4.url, s['page_url']) + '.tcx', download_dir, cookies):
            known_sessions.add(s['id'])
    write_known_sessions(known_sessions)
Example #12
def parse_html(file_obj_or_str, url=None):
    '''Discovers various metadata URLs embedded in a given HTML document, such
    as feeds and RDF.

    :param file_obj_or_str: The HTML document to be parsed.
    :type file_obj_or_str: a file-like object or :class:`str`
    :param url: The URL that the HTML document was retrieved from.
    :type url: :class:`str` or :const:`None`
    :returns: A dictionary, where the key is the URL's MIME type and the value
              is a dictionary of URL-title pairs.
    :rtype: :class:`dict`
    '''
    urls = {}

    # load the modules only when the function is first called.
    if not _MODULE_CACHE:
        for name, module in MODULES.iteritems():
            mod = import_module(module, package=__name__)
            try:
                _MODULE_CACHE[name] = mod.Discoverer
            except AttributeError:
                raise AttributeError('''\
Could not find a Discoverer object in the %s module.''' % name)

    doc = html5lib.parse(file_obj_or_str, treebuilder='lxml')
    print type(doc)
    for name, discoverer in _MODULE_CACHE.iteritems():
        urls[name] = discoverer.parse(doc, url=url)
    return urls
Example #13
def cleanup_html(html):
    """Cleanups malformed and wrongly-encoded HTML.
    Returns UTF-8 encoded well-formed HTML"""
    h = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False)
    stream = StringIO()
    h.write(stream, encoding='utf-8')
    return stream.getvalue()
Example #14
def test_to_sax():
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en">
        <title>Directory Listing</title>
        <a href="/"><b/></p>
    """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
            'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n        '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS',  ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n    '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
Example #15
 def _parse_content(self, content):
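     # Find the table whose header row starts with 'Auction Date', turn each
     # data row into an auction record, then average prices per monthly period.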
     document = html5lib.parse(content,
                               treebuilder="lxml",
                               namespaceHTMLElements=False)
     for tbl in document.xpath('.//table'):
         ths = tbl.xpath('.//tr//th')
         if len(ths) == 0 or ths[0].text != 'Auction Date':
             continue
         for row in tbl.xpath('.//tr')[1:]:
             row_data = row.xpath('td')
             if len(row_data) == 0 or row_data[0].text.strip() == '':
                 continue
             str_good_date = sub('([0-9]*)[a-z]*( [A-z]* [0-9]*)',r'\1\2',row_data[0].text.strip())
             dtt = datetime.strptime(str_good_date, "%d %B %Y").date()
             auction_info = {'date': dtt,
                             'average_price': _convert_type(row_data[1].text.strip()[1:], 'float'),
                             'lowest_price': _convert_type(row_data[2].text.strip()[1:], 'float'),
                             'total_volume': _convert_type(row_data[3].text or '0', 'int'),
                             'co_fired_volume': _convert_type(row_data[4].text or '0', 'int'),
                             'period': "{}{:02d}".format(dtt.year, dtt.month)}
             self.auctions.append(auction_info)
     for info in self.auctions:
         if info['period'] in self.periods:
             previous = self.periods[info['period']]
             if not isinstance(previous, list):
                 self.periods[info['period']] = [info['average_price'], previous]
             else:
                 self.periods[info['period']].append(info['average_price'])
         else:
             self.periods[info['period']] = info['average_price']
     for key in self.periods.keys():
         if isinstance(self.periods[key], list):
             self.periods[key] = sum(self.periods[key]) / len(self.periods[key])
     return True
Example #16
 def assertADWithRawHTML(self, params, expected, template=None):
     if not template:
         template = HTML5_TEMPLATE
     html = template % params
     doc = html5lib.parse(html, treebuilder='lxml')
     feeds = self.parse_with_base(doc)
     self.assertEqual(feeds, expected)
Example #17
    def metadata(self) -> dict:
        """Return the metadata for this video.

        :return: the video's metadata
        :rtype: :class:`dict`
        """
        if self._metadata:
            return self._metadata
        response = urlopen(self.to_url)
        if response.status != 200:
            raise Exception('status_code: {}, url: {}'.format(
                response.status, self.to_url))
        html = html5lib.parse(response.readall().decode('utf-8'),
                              namespaceHTMLElements=False)
        xpath_builder = "./head//meta[@{attr}='{value}']".format
        query = [
            ('property', 'og:title'),
            ('property', 'og:image'),
            ('name', 'keywords')
        ]
        for attr, value in query:
            elem = html.findall(xpath_builder(attr=attr, value=value))
            if elem:
                self._metadata[value] = elem[0].get('content')
            self._metadata.setdefault(value, None)
        return self._metadata
Example #18
    def parse_comments(self, root, raw):
        ans = ''
        ns = tuple(self.selector('#bookDescription_feature_div noscript'))
        if ns:
            ns = ns[0]
            if len(ns) == 0 and ns.text:
                import html5lib
                # html5lib parsed noscript as CDATA
                ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
            else:
                ns.tag = 'div'
            ans = self._render_comments(ns)
        else:
            desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            if desc:
                ans = self._render_comments(desc[0])

        desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
        if desc:
            ans += self._render_comments(desc[0])
        else:
            # Idiot chickens from amazon strike again. This data is now stored
            # in a JS variable inside a script tag URL encoded.
            m = re.search(b'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
            if m is not None:
                try:
                    text = unquote(m.group(1)).decode('utf-8')
                    nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
                    desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
                    if desc:
                        ans += self._render_comments(desc[0])
                except Exception as e:
                    self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))

        return ans
Example #19
    def __init__(self, filename, test_type, parent):
        self.url = parent.session.config.server.url(filename)
        self.type = test_type
        self.variants = []
        # Some tests are reliant on the WPT servers substitution functionality,
        # so tests must be retrieved from the server rather than read from the
        # file system directly.
        handle = urllib.request.urlopen(self.url,
                                        context=parent.session.config.ssl_context)
        try:
            markup = handle.read()
        finally:
            handle.close()

        if test_type not in TEST_TYPES:
            raise ValueError('Unrecognized test type: "%s"' % test_type)

        parsed = html5lib.parse(markup, namespaceHTMLElements=False)
        name = None
        includes_variants_script = False
        self.expected = None

        for element in parsed.getiterator():
            if not name and element.tag == 'title':
                name = element.text
                continue
            if element.tag == 'meta' and element.attrib.get('name') == 'variant':
                self.variants.append(element.attrib.get('content'))
                continue
            if element.tag == 'script':
                if element.attrib.get('id') == 'expected':
                    self.expected = json.loads(text_type(element.text))

                src = element.attrib.get('src', '')

                if 'variants.js' in src:
                    includes_variants_script = True
                    if not resolve_uri(filename, src):
                        raise ValueError('Could not resolve path "%s" from %s' % (src, filename))

        if not name:
            raise ValueError('No name found in file: %s' % filename)
        elif self.type == 'functional':
            if not self.expected:
                raise ValueError('Functional tests must specify expected report data')
            if not includes_variants_script:
                raise ValueError('No variants script found in file: %s' % filename)
            if len(self.variants) == 0:
                raise ValueError('No test variants specified in file %s' % filename)
        elif self.type == 'unit' and self.expected:
            raise ValueError('Unit tests must not specify expected report data')

        # Ensure that distinct items have distinct fspath attributes.
        # This is necessary because pytest has an internal cache keyed on it,
        # and only the first test with any given fspath will be run.
        #
        # This cannot use super(HTMLItem, self).__init__(..) because only the
        # Collector constructor takes the fspath argument.
        pytest.Item.__init__(self, name, parent)
        pytest.Collector.__init__(self, name, parent, fspath=py.path.local(filename))
Example #20
def get_courses(args, parameter, value):
    filename = 'courses-%s-%s.json' % (parameter, value)
    if args.cached and os.path.exists(filename):
        with open(filename) as fp:
            return json.load(fp)

    url = (
        'http://kursuskatalog.au.dk/coursecatalog/Course/ajaxsearch/' +
        '?tx_aucoursecatalog_pi1[%s]=%s') % (parameter, value)
    logger.debug('Retrieve %r', url)
    r = requests.get(url)
    logger.debug('%d bytes', len(r.content))
    document = html5lib.parse(r.content, encoding=r.encoding)
    ns = {'h': 'http://www.w3.org/1999/xhtml'}
    path = ".//h:tbody/h:tr"
    rows = document.findall(path, ns)
    courses = []
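    # Each table row is one course; its cell texts map onto the keys zipped
    # below, and the first cell holds the link to the course page.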
    for row in rows:
        path = "./h:td"
        cells = row.findall(path, ns)
        cellTexts = [' '.join(''.join(cell.itertext()).split())
                     for cell in cells]
        courseCell = cells[0]
        path = './h:a'
        courseLink = courseCell.find(path, ns)
        coursePath = courseLink.get('href')
        course = dict(zip(
            'courseName level ects period courseLanguage institut'.split(),
            cellTexts))
        course['link'] = coursePath
        courses.append(course)
    with open(filename, 'w') as fp:
        json.dump(courses, fp, indent=2)
    return courses
Example #21
def scrape_registrar(course, url, css_query, max_capacity):
    try:
        r = requests.get(url)
    except:
        print("Connection reset. Retrying.")
        send_email('Connection reset', 'Retrying')
        return

    raw_html = r.text
    page = html5lib.parse(raw_html, treebuilder='lxml', namespaceHTMLElements=False)
    selector = lxml.cssselect.CSSSelector(css_query)
    match = selector(page)
    status = match[0].text
    print('current: %s, capcity: %s' % (status, max_capacity))

    if status != max_capacity:
        subj = '%s is availible, sign up now!\n' % course
        body = 'https://be.my.ucla.edu/ClassPlanner/ClassPlan.aspx'

        print(subj)
        send_email(subj, body)
        return True
    else:
        print('%s is still closed :(\n' % course)
        return False
Example #22
def get_top_result(title, artist):
  r = requests.get(BASE_URL,params={'q':title})
  raw_html = r.text
  page = html5lib.parse(raw_html,
                        treebuilder='lxml',
                        namespaceHTMLElements=False)

  # Find result with closest matching artist
  css_query = '.sen b'
  selector = lxml.cssselect.CSSSelector(css_query)
  match = selector(page)
  top_match = 100
  index = 0
  count = 0
  for m in match:
    if m.text is not None and m.text.upper() == m.text:
      lev_dist = levenshtein(m.text.lower(), artist.lower())
      if lev_dist < top_match:
        top_match = lev_dist
        index = count
      count+=1

  if top_match < 5:
    css_query = '.sen a'
    selector = lxml.cssselect.CSSSelector(css_query)
    match = selector(page)
    if index < len(match):
      return ''.join([char for char in match[index].get('href')])
  return None
Example #23
 def get_image_urls(self, title, author, log, abort, timeout):
     from calibre.utils.cleantext import clean_ascii_chars
     from urllib import urlencode
     import html5lib
     import json
     from collections import OrderedDict
     ans = OrderedDict()
     br = self.browser
     q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')}).decode('utf-8')
     sz = self.prefs['size']
     if sz == 'any':
         sz = ''
     elif sz == 'l':
         sz = 'isz:l,'
     else:
         sz = 'isz:lt,islt:%s,' % sz
     # See https://www.google.com/advanced_image_search to understand this
     # URL scheme
     url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
     log('Search URL: ' + url)
     raw = br.open(url).read().decode('utf-8')
     root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False)
     for div in root.xpath('//div[@class="rg_meta"]'):
         try:
             data = json.loads(div.text)
         except Exception:
             continue
         if 'ou' in data:
             ans[data['ou']] = True
     return list(ans.iterkeys())
Example #24
    def __init__(self, *args, **kwargs):
        self.__url__ = None
        self.__doc__ = None
        self.__parsed__ = {"items": [], "rels": {}}

        if len(args) > 0:
            if type(args[0]) is file:
                # load file
                self.__doc__ = html5lib.parse(args[0], treebuilder="dom")
                if len(args) > 1 and (type(args[1]) is str or type(args[1]) is unicode):
                    self.__url__ = args[1] #TODO: parse this properly
            elif type(args[0]) is str or type(args[0]) is unicode:
                pass
                # load URL

        # test for base
        if self.__doc__ is not None and self.__url__ is None:
            poss_bases = self.__doc__.getElementsByTagName("base")
            actual_base = None
            if len(poss_bases) != 0:
                for poss_base in poss_bases:
                    if poss_base.hasAttribute("href"):
                        # check to see if absolute
                        if urlparse(poss_base.getAttribute("href")).netloc != '':
                            self.__url__ = poss_base.getAttribute("href")

        if self.__doc__ is not None:
            # parse!
            self.__doc__.documentElement.apply_backcompat_rules()
            self.parse()
Example #25
def thread(data, default=u"Untitled.", id=None):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the `isso-thread` id.
    """

    html = html5lib.parse(data, treebuilder="dom")

    assert html.lastChild.nodeName == "html"
    html = html.lastChild

    # aka getElementById, but limited to div and section tags
    el = list(filter(lambda i: i.attributes["id"].value == "isso-thread",
              filter(lambda i: "id" in i.attributes,
                     chain(*map(html.getElementsByTagName, ("div", "section"))))))

    if not el:
        return id, default

    el = el[0]
    visited = []

    def recurse(node):
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                return recurse(child)

    def gettext(rv):
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    try:
        id = unquote(el.attributes["data-isso-id"].value)
    except (KeyError, AttributeError):
        pass

    try:
        return id, unquote(el.attributes["data-title"].value)
    except (KeyError, AttributeError):
        pass

    while el is not None:  # el.parentNode is None in the very end

        visited.append(el)
        rv = recurse(el)

        if rv:
            return id, ''.join(gettext(rv)).strip()

        el = el.parentNode

    return id, default
Example #26
def do_parse_test(html, n):
    start = time.time()
    for i in xrange(n):
        tree = html5lib.parse(html)
        tree.name
    stop = time.time()
    print stop - start, "s"
Example #27
def generate_public_uti_map():
    from lxml import etree
    import html5lib, urllib
    raw = urllib.urlopen(
        'https://developer.apple.com/library/ios/documentation/Miscellaneous/Reference/UTIRef/Articles/System-DeclaredUniformTypeIdentifiers.html').read()
    root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    tables = root.xpath('//table')[0::2]
    data = {}
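    # Map each file extension listed in the tags column to its UTI identifier.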
    for table in tables:
        for tr in table.xpath('descendant::tr')[1:]:
            td = tr.xpath('descendant::td')
            identifier = etree.tostring(td[0], method='text', encoding=unicode).strip()
            tags = etree.tostring(td[2], method='text', encoding=unicode).strip()
            identifier = identifier.split()[0].replace('\u200b', '')
            exts = [x.strip()[1:].lower() for x in tags.split(',') if x.strip().startswith('.')]
            for ext in exts:
                data[ext] = identifier
    lines = ['PUBLIC_UTI_MAP = {']
    for ext in sorted(data):
        r = ("'" + ext + "':").ljust(16)
        lines.append((' ' * 4) + r + "'" + data[ext] + "',")
    lines.append('}')
    with open(__file__, 'r+b') as f:
        raw = f.read()
        f.seek(0)
        nraw = re.sub(r'^PUBLIC_UTI_MAP = .+?}', '\n'.join(lines), raw, flags=re.MULTILINE | re.DOTALL)
        f.truncate(), f.write(nraw)
Example #28
def scrape_category (url, title):
    category_slug = slugify (title)

    try:
        f = urlopen (url)
    except ValueError:
        if trace: print 'Retrying:', url
        url = 'http://eracks.com' + url.replace (' ','%20')
        if trace: print 'As:', url
        f = urlopen (url)

    doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False)  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html (doc)
    jQuery = PyQuery([doc])

    page_title =  jQuery ('title').text()

    if page_title.startswith ("eRacks Open Source Systems: "):
        page_title = page_title.partition ("eRacks Open Source Systems: ") [-1]

    if page_title.startswith ("eRacks "):
        page_title = page_title.partition ("eRacks ") [-1]

    content = jQuery ('td#content')
    links = content ('a')
    images = content ('img')

    for link in links:
        a = PyQuery (link)
        href = a.attr('href')
        skus = find_sku.findall (href)

        if skus:
            sku = skus [0]
            #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
            a.attr ('href', '/products/%s/%s/' % (category_slug, sku))
        elif href.startswith ('/Legacy'):
            sku = slugify (href.split ('/') [-1])
            #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
            a.attr ('href', '/products/%s/%s/' % (category_slug, sku))

        print 'link:', a.attr('href')

    for image in images:
        img = PyQuery (image)
        src = img.attr('src')
        newsrc = getimage (src, 'categories/' + category_slug)
        img.attr ('src', newsrc)
        print 'image:', newsrc

    description = content.html()
    if trace: print description

    if dbteeth:
        cat = Categories.objects.get (name=title)
        cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
        cat.description = description
        cat.title = page_title
        cat.save()
        print '..saved.'
Example #29
def chrome_versions():
    if is_ci:
        return []
    print('Getting chrome versions...')
    import html5lib
    raw = download_securely(
        'https://en.wikipedia.org/wiki/Google_Chrome_version_history').decode('utf-8')
    root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    table = root.xpath('//*[@id="mw-content-text"]//tbody')[-1]
    ans = []
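    # Walk the version-history rows; stop at the first row whose first cell
    # is not highlighted green (#a0e75a) or salmon.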
    for tr in table.iterchildren('tr'):
        cells = tuple(tr.iterchildren('td'))
        if not cells:
            continue
        if not cells[2].text or not cells[2].text.strip():
            continue
        s = cells[0].get('style')
        if '#a0e75a' not in s and 'salmon' not in s:
            break
        chrome_version = cells[0].text.strip()
        ts = datetime.strptime(cells[1].text.strip().split()[
                               0], '%Y-%m-%d').date().strftime('%Y-%m-%d')
        try:
            webkit_version = cells[2].text.strip().split()[1]
        except IndexError:
            continue
        ans.append({'date': ts, 'chrome_version': chrome_version,
                    'webkit_version': webkit_version})
    return list(reversed(ans))
Example #30
def parse_html_to_tree(html):
	'''
	Normalize the HTML document content with html5lib, converting the HTML
	into an XML element DOM tree.
	'''
	# TODO: the DOM tree structure built by html5lib seems unreliable
	root = html5lib.parse(html, treebuilder="etree")
	return root
Example #31
def downloadHTMLFile(url, hasToStartWith):
    print("Download from " + url)
    website = urllib.urlopen(url)
    t = website.read()
    dom = html5lib.parse(t, treebuilder="dom")
    lis = dom.getElementsByTagName("li")
    resultList = []
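    # Keep the leading text of each <li>, stop once the "Lists" section
    # starts, and trim any trailing parenthetical from the value.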
    for l in lis:
        if len(l.childNodes) > 0:
            ll = l.childNodes[0]
            if len(ll.childNodes) > 0:
                lll = ll.childNodes[0]
                if lll.nodeValue == None:
                    continue
                if lll.nodeValue.startswith("Lists"):
                    break
                nv = lll.nodeValue
                if "(" in lll.nodeValue:
                    nv = lll.nodeValue[:lll.nodeValue.find("(") - 1]
                    print("----->" + nv)

                resultList.append(nv)
                print(lll.nodeValue)
    return resultList
Example #32
 def __init__(self,
              guess=None,
              filename=None,
              url=None,
              file_obj=None,
              string=None,
              tree=None,
              encoding=None,
              base_url=None,
              url_fetcher=default_url_fetcher,
              media_type='print'):
     result = _select_source(guess, filename, url, file_obj, string, tree,
                             base_url, url_fetcher)
     with result as (source_type, source, base_url, protocol_encoding):
         if source_type == 'tree':
             result = source
         else:
             if not encoding:
                 encoding = protocol_encoding
             if isinstance(source, unicode):
                 encoding = None
             result = html5lib.parse(source,
                                     treebuilder='lxml',
                                     encoding=encoding,
                                     namespaceHTMLElements=False)
             assert result
     base_url = find_base_url(result, base_url)
     if hasattr(result, 'getroot'):
         result.docinfo.URL = base_url
         result = result.getroot()
     else:
         result.getroottree().docinfo.URL = base_url
     self.root_element = result
     self.base_url = base_url
     self.url_fetcher = url_fetcher
     self.media_type = media_type
Example #33
def test_parse_etree():
    """
    Parsing a fragment to an etree produces a document root element that
    contains the document, including implied tags.
    """
    doc = parse(
        "<!DOCTYPE html><html><title>...</title><p>...</p></html><!-- ... -->",
        treebuilder="etree",
    )
    assert doc.tag == 'DOCUMENT_ROOT'
    [doctype, html, comment] = doc
    assert doctype.tag == "<!DOCTYPE>"
    assert doctype.text == "html"
    assert html.tag == "{http://www.w3.org/1999/xhtml}html"
    assert comment.tag is ElementTree.Comment
    assert comment.text == " ... "
    [head, body] = html
    assert head.tag == "{http://www.w3.org/1999/xhtml}head"
    assert body.tag == "{http://www.w3.org/1999/xhtml}body"
    [title] = head
    assert title.tag == "{http://www.w3.org/1999/xhtml}title"
    [p] = body
    assert p.tag == "{http://www.w3.org/1999/xhtml}p"
    assert p.text == "..."
Example #34
	def step1(self,fonts):
		# self.width = width # client width
		# self.height= height # client height
		self.fonts = fonts # font family
		self.lineNo = 0 # counts every line in the book

		self.words = [] # unique words; parallel with widths
		self.widths = [] # each word's width in pixels
		self.index = [] # each word's index into words and widths; one entry per word in the chapter/book

		self.init()

		with open(DIR + '/index.html', 'r', encoding="utf-8") as f:
			html = f.read()
			document = html5lib.parse(html)

		self.traverse(document)
		if len(self.rects) > 0:
			pageNo = int(self.lineNo / LPP)
			self.lineNo = LPP * (pageNo+1)
			self.produceFile()
		self.makeChapter()

		self.makeWords()
Example #35
# -*- coding: utf-8 -*-
import html5lib

print('Parsing by specifying a treebuilder:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Call html5lib.parse directly; the tree is built with lxml
content = html5lib.parse(document,
                         treebuilder="lxml",
                         namespaceHTMLElements=False)
# The XPath to the tags that hold the content to extract
rows = content.xpath('/html/body/h1')
for row in rows:
    t = row.xpath('./text()')[0]  # after locating the tag node, extract its content with text()
    print(t)

print('Parsing by specifying a tree:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Create an HTMLParser instance that builds an lxml tree
p = html5lib.HTMLParser(strict=False,
                        tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
# Parse the HTML document
t = p.parse(document)
rows = t.xpath('/html/body/h1')
for row in rows:
    t = row.xpath('./text()')[0]
    print(t)

print('Extracting hyperlinks by specifying a tree:')
document = '<html><head><title>Test</title></head><body><a href="www.baidu.com">baidu</body></html>'
p = html5lib.HTMLParser(strict=False,
Example #36
def main():

    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('filename', metavar='filename', type=str, nargs='+',
                        help='filename with HTML timetable')

    args = parser.parse_args()
    #print(args.filename)

    hour_list = []

    multirow = False
    multirow_no = 0

    with open(args.filename[0], 'rb') as f:
        doc = html5lib.parse(f)
        #print(doc)
        body = doc.find('{http://www.w3.org/1999/xhtml}body')
        #print(body)
        print()
        for e in body:
            if e.tag == '{http://www.w3.org/1999/xhtml}table':
                #print(e)
                #print(e.attrib)
                cl = e.attrib['class']
                if cl == 'grid-border-args':
                    tbody = e.find('{http://www.w3.org/1999/xhtml}tbody')
                    # for each row in the table
                    for day_no, row in enumerate(tbody.findall('{http://www.w3.org/1999/xhtml}tr')):
                        #print(row)
                        # for each time slot
                        for col_no, td in enumerate(row.findall('{http://www.w3.org/1999/xhtml}td')):
                            #print(td.text)
                            #print(sh)
                            if 'class' in td.attrib:
                                cl = td.attrib['class']
                                if cl == 'col-label-one' or cl == 'row-label-one':
                                    #print(td.attrib)
                                    if 'rowspan' in td.attrib:
                                        if td.attrib['rowspan'] != '1':
                                            #print('Anomaly')
                                            multirow = True
                                        else:
                                            multirow = False
                                    text = td.text if td.text else '       '
                                    if text in DAYS:
                                        max_day_len = max([ len(s) for s in DAYS])
                                        print(td.text + ''.rjust(max_day_len - len(text)) + ': ', end=' ')
                                    else:
                                        hour_list.append(td.text)
                                        print(td.text, end=' ')
                                # actual lecture
                                elif cl == 'object-cell-border':
                                    #print('HODINA')
                                    hours, lecture = parse_lecture(td)
                                    for i in range(hours):
                                        #print('day_no= {}, multirow_no= {}'.format(day_no, multirow_no))
                                        sh[day_no - multirow_no].append('X')
                                else:
                                    # empty
                                    #print('day_no= {}, multirow_no= {}'.format(day_no, multirow_no))
                                    sh[day_no - multirow_no].append('O')
                            else:
                                print('       ')
                        print()
                        if multirow:
                            multirow_no += 1

    print()

    print_schedule(hour_list, sh)
Example #37
for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url

    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

        tree=""
        try:
            cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
            tree = etree.tostring(document)
        except:
            continue

        tree = etree.tostring(document)
        cleantree = tree.decode("utf8").replace("&#160;", " ")
        cleantree = cleantree.replace("\t", " ")

        # lang id
        lang = guess_lang_from_data2(cleantree)
        if len(languages) > 0 and lang not in languages:
            logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.")
        else:
            # If enabled, remove boilerplate HTML
            if options.boilerpipe:
Example #38
def parse_headlines_s(s):
    root = html5lib.parse(s, default_treebuilder)
    return parse_headlines(root)
Example #39
import html5lib, sys
from xml.etree import ElementTree as ET 


tree = html5lib.parse(sys.stdin.read().decode("utf-8"), namespaceHTMLElements=False)

def filtertext(t):
    if t == None:
        return None
    return t.replace("\n", " ")

# STOP = "script pre code style".split()
PROCESS = "p sub li a em".split()

def process (elt):
    # if elt.tag in STOP:
    #     return
    # print elt.tag
    if elt.tag in PROCESS:
        elt.text = filtertext(elt.text)
        elt.tail = filtertext(elt.tail)
    for child in elt:
        process(child)

process(tree)
sys.stdout.write(ET.tostring(tree).encode("utf-8"))

Example #40
def cmd_login(user, passwd):
    """Attempts to log into GOG and saves the resulting cookiejar to disk.
    """
    login_data = {
        'user': user,
        'passwd': passwd,
        'auth_url': None,
        'login_token': None,
        'two_step_url': None,
        'two_step_token': None,
        'two_step_security_code': None,
        'login_success': False,
    }

    global_cookies.clear()  # reset cookiejar

    # prompt for login/password if needed
    if login_data['user'] is None:
        login_data['user'] = input("Username: ")
    if login_data['passwd'] is None:
        # the source listing masked this line; a getpass prompt is assumed
        login_data['passwd'] = getpass.getpass()
    info("attempting gog login as '{}' ...".format(login_data['user']))

    # fetch the auth url
    with request(GOG_HOME_URL, delay=0) as page:
        etree = html5lib.parse(page, namespaceHTMLElements=False)
        for elm in etree.findall('.//script'):
            if elm.text is not None and 'GalaxyAccounts' in elm.text:
                login_data['auth_url'] = elm.text.split("'")[3]
                break

    # fetch the login token
    with request(login_data['auth_url'], delay=0) as page:
        etree = html5lib.parse(page, namespaceHTMLElements=False)
        # Bail if we find a request for a reCAPTCHA
        if len(etree.findall(
                './/div[@class="g-recaptcha form__recaptcha"]')) > 0:
            error(
                "cannot continue, gog is asking for a reCAPTCHA :(  try again in a few minutes."
            )
            return
        for elm in etree.findall('.//input'):
            if elm.attrib['id'] == 'login__token':
                login_data['login_token'] = elm.attrib['value']
                break

    # perform login and capture two-step token if required
    with request(GOG_LOGIN_URL,
                 delay=0,
                 args={
                     'login[username]': login_data['user'],
                     'login[password]': login_data['passwd'],
                     'login[login]': '',
                     'login[_token]': login_data['login_token']
                 }) as page:
        etree = html5lib.parse(page, namespaceHTMLElements=False)
        if 'two_step' in page.geturl():
            login_data['two_step_url'] = page.geturl()
            for elm in etree.findall('.//input'):
                if elm.attrib['id'] == 'second_step_authentication__token':
                    login_data['two_step_token'] = elm.attrib['value']
                    break
        elif 'on_login_success' in page.geturl():
            login_data['login_success'] = True

    # perform two-step if needed
    if login_data['two_step_url'] is not None:
        login_data['two_step_security_code'] = input(
            "enter two-step security code: ")

        # Send the security code back to GOG
        with request(login_data['two_step_url'],
                     delay=0,
                     args={
                         'second_step_authentication[token][letter_1]':
                         login_data['two_step_security_code'][0],
                         'second_step_authentication[token][letter_2]':
                         login_data['two_step_security_code'][1],
                         'second_step_authentication[token][letter_3]':
                         login_data['two_step_security_code'][2],
                         'second_step_authentication[token][letter_4]':
                         login_data['two_step_security_code'][3],
                         'second_step_authentication[send]':
                         "",
                         'second_step_authentication[_token]':
                         login_data['two_step_token']
                     }) as page:
            if 'on_login_success' in page.geturl():
                login_data['login_success'] = True

    # save cookies on success
    if login_data['login_success']:
        info('login successful!')
        global_cookies.save()
    else:
        error('login failed, verify your username/password and try again.')
Example #41
def html5libparse(url, f):
    'html5libparse() - use html5lib to parse anchor tags'
    output(urljoin(url, x.attributes['href'])
           for x in parse(f)
           if isinstance(x, treebuilders.simpletree.Element) and x.name == 'a')
Example #42
    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
            identifiers={}, timeout=30):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        from lxml.html import tostring
        import html5lib

        testing = getattr(self, 'running_a_test', False)

        query, domain = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        if testing:
            print ('Using user agent for amazon: %s'%self.user_agent)
        try:
            raw = br.open_novisit(query, timeout=timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                log.error('Query malformed: %r'%query)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = _('Amazon timed out. Try again later.')
                log.error(msg)
            else:
                msg = 'Failed to make identify query: %r'%query
                log.exception(msg)
            return as_unicode(msg)

        raw = clean_ascii_chars(xml_to_unicode(raw,
            strip_encoding_pats=True, resolve_entities=True)[0])

        if testing:
            import tempfile
            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
                    suffix='.html', delete=False) as f:
                f.write(raw.encode('utf-8'))
            print ('Downloaded html for results page saved in', f.name)

        matches = []
        found = '<title>404 - ' not in raw

        if found:
            try:
                root = html5lib.parse(raw, treebuilder='lxml',
                        namespaceHTMLElements=False)
            except:
                msg = 'Failed to parse amazon page for query: %r'%query
                log.exception(msg)
                return msg

            errmsg = root.xpath('//*[@id="errorMessage"]')
            if errmsg:
                msg = tostring(errmsg, method='text', encoding=unicode).strip()
                log.error(msg)
                # The error is almost always a not found error
                found = False

        if found:
            matches = self.parse_results_page(root, domain)

        if abort.is_set():
            return

        if not matches:
            if identifiers and title and authors:
                log('No matches found with identifiers, retrying using only'
                        ' title and authors. Query: %r'%query)
                return self.identify(log, result_queue, abort, title=title,
                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r'%query)
            return

        workers = [Worker(url, result_queue, br, log, i, domain, self,
                            testing=testing) for i, url in enumerate(matches)]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None
Example #43
def parse(path='html5ents.xml'):
    return html5lib.parse(open(path), treebuilder='lxml')
Example #44
 def test_html5_parser(self):
     from html5_parser import parse
     parse('<p>xxx')
Example #45
def parse_email_html(html_data):
    etree_document = html5lib.parse(html_data,
                                    treebuilder="lxml",
                                    namespaceHTMLElements=False)
    root = etree_document.getroot()

    # By inspecting HTML payloads (saved/dumped elsewhere),
    # (samples taken at points which the scraping threw an exception!),
    #
    # It's clear that the format in the emails is close-enough that it's easier
    # to write a flexible scraper, than to scrape strictly for slight variations.
    #
    # Emails after 2014-Aug (ish) change from:
    #  <td>BookTitle <span>By AuthorName</span></td>
    # to:
    #  <td><a>$BookTitle</a> <span>By $AuthorName</span></td>
    # Additionally, emails after 2014-Aug (ish) no longer include
    # a <td>$DateOfPurchase</td>, so, this changes xpath of $Price <td/>
    #
    # Emails after 2015-Aug (ish) change the specific xpath to the items table.
    #
    # Edge case in my emails is an email (before 2014-Aug) with no <span/>,
    #  & so no author. Okay.

    # General formula was
    #   some_xpath = "/path/to/el"
    #   some_el = root.xpath(some_xpath) # n.b. this is a list.
    #   some_str = f(some_el) # some_el[0].text, etc.

    # TBH, most of the rest is "magic"/hard-coded enough (by nature)
    # that it's not particularly maintainable.
    # Scrapers should be fragile.

    # items_table contains all the <tr/> with order items.
    # items_table_xpath = "/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[2]/table[3]/tbody/tr/td[2]/table[3]/tbody/tr/td/table[4]"
    items_table_xpath = "/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[2]/table/tbody/tr/td[2]/table[3]/tbody/tr/td/table[4]"
    items_table = root.xpath(items_table_xpath)[0]

    # "/tbody..." vs "tbody..."?
    item_rows = items_table.xpath("tbody/tr")

    # print "DEBUG Num item rows: ", len(item_rows)

    # For individual <tr/>, return { title, author, price }
    def item_from_row(tr):
        # Because it's email, the <tr/> has a table or two inside it. Cool.
        title_author_td = tr.xpath("td/table/tbody/tr/td/table/tbody/tr/td[2]")

        # print "DEBUG Title Author TD len", len(title_author_td)

        # How to do things like ".getElementsByTag"? :S
        # Prefer BeautifulSoup for some things?

        a = title_author_td[0].xpath("a")

        if len(a) == 0:
            title = title_author_td[0].text
        else:
            title = a[0].text

        # print "DEBUG Title", title

        span = title_author_td[0].xpath("span")

        if len(span) > 0:
            # Get rid of the "By.."
            author = " ".join(span[0].text.split()[1:])
        else:
            author = None

        # print "DEBUG author ", author

        # Price <td/> is the last one.
        price_td = tr.xpath("td/table/tbody/tr/td/table/tbody/tr/td")[-1]
        price = price_td.text

        print "DEBUG Kobo found '%s' by '%s' @ '%s'" % (title, author, price)

        return {"title": title, "author": author, "price": price}

    return [item_from_row(r) for r in item_rows]
Example #46
def filterHtml (selectFunc, fd):
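    # Walk the parsed tree as a token stream, keep only the parts selectFunc
    # accepts (via Select), and re-serialize the result to a string.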
    document = html5lib.parse (fd)
    walker = html5lib.getTreeWalker("etree")
    stream = walker (document)
    s = HTMLSerializer()
    yield ''.join (s.serialize(Select (stream, selectFunc)))
Example #47
def thread(data, default="Untitled.", id=None):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the `isso-thread` id.
    """

    html = html5lib.parse(data, treebuilder="dom")

    assert html.lastChild.nodeName == "html"
    html = html.lastChild

    # aka getElementById, but limited to div and section tags
    el = list(
        filter(
            lambda i: i.attributes["id"].value == "isso-thread",
            filter(
                lambda i: "id" in i.attributes,
                chain(*map(html.getElementsByTagName, ("div", "section"))))))

    if not el:
        return id, default

    el = el[0]
    visited = []

    def recurse(node):
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                return recurse(child)

    def gettext(rv):
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    try:
        id = unquote(el.attributes["data-isso-id"].value)
    except (KeyError, AttributeError):
        pass

    try:
        return id, unquote(el.attributes["data-title"].value)
    except (KeyError, AttributeError):
        pass

    while el is not None:  # el.parentNode is None in the very end

        visited.append(el)
        rv = recurse(el)

        if rv:
            return id, ''.join(gettext(rv)).strip()

        el = el.parentNode

    return id, default
Example #48
def parse(doc):
	return html5lib.parse(doc, namespaceHTMLElements=False)
Example #49
def parse(path="html5ents.xml"):
    return html5lib.parse(open(path), treebuilder="lxml")
Example #50
from collections import OrderedDict

# This script scrapes data from BioPKU's PAHvdb.

# There are 2 cookies for the page. This one is the one that is required for the script to work.
cookies = {'BIOPKUCopyrightDisclaimer': '1'}

# This has to be a post request. Can't do requests.get(URL).
r = requests.post(
    'http://biopku.org/pah/search-results-browse.asp',
    data={'searchType': '2'}, # This represents clicking search and browse to get to the list of all variants in the database
    cookies=cookies
)
doc = html5lib.parse(
    r.text, #input HTML
    treebuilder='lxml', #enable xpath function
    namespaceHTMLElements=False #disable namespace prefixes
)
links = doc.xpath('.//div[@id="container-body-wide"]//td[position()=4]//a')
#links = ['result-details-pah.asp?ID=689', 'result-details-pah.asp?ID=623','result-details-pah.asp?ID=622'] #for testing purposes
#links = ['result-details-pah.asp?ID=692', 'result-details-pah.asp?ID=693', 'result-details-pah.asp?ID=694', 'result-details-pah.asp?ID=733']

# Used https://pythex.org/ to check my regex
link_match = re.compile("/centralstore/pah/[a-zA-Z\.\d_\-()+]*_PAH.htm[l]?")

for link in links:
    r = requests.get('http://biopku.org/pah/' + link.attrib['href'], cookies=cookies)
    #r = requests.get('http://biopku.org/pah/' + link, cookies=cookies)
    doc = html5lib.parse(r.text, treebuilder='lxml', namespaceHTMLElements=False)
    rows = doc.xpath('.//div[@id="right-body"]//td')
    # print(rows)  # this is blank [] for empty pages
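    # Hypothetical continuation (hedged): the snippet above ends mid-loop.
    # Assuming the detail page lays out label/value pairs in adjacent <td>
    # cells -- an assumption, not verified against BioPKU -- pair them up:
    cells = [''.join(td.itertext()).strip() for td in rows]
    record = OrderedDict(zip(cells[::2], cells[1::2]))
    print(record)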
Example #51
0
class SourceFile(object):
    parsers = {
        "html": lambda x: html5lib.parse(x, treebuilder="etree"),
        "xhtml": lambda x: ElementTree.parse(x, XMLParser.XMLParser()),
        "svg": lambda x: ElementTree.parse(x, XMLParser.XMLParser())
    }

    root_dir_non_test = set(["common"])

    dir_non_test = set(["resources", "support", "tools"])

    dir_path_non_test = {("css21", "archive"), ("css", "CSS2", "archive"),
                         ("css", "common")}

    def __init__(self, tests_root, rel_path, url_base, contents=None):
        """Object representing a file in a source tree.

        :param tests_root: Path to the root of the source tree
        :param rel_path: File path relative to tests_root
        :param url_base: Base URL used when converting file paths to urls
        :param contents: Byte array of the contents of the file or ``None``.
        """

        self.tests_root = tests_root
        if os.name == "nt":
            # do slash normalization on Windows
            if isinstance(rel_path, binary_type):
                self.rel_path = rel_path.replace(b"/", b"\\")
            else:
                self.rel_path = rel_path.replace(u"/", u"\\")
        else:
            self.rel_path = rel_path
        self.url_base = url_base
        self.contents = contents

        self.dir_path, self.filename = os.path.split(self.rel_path)
        self.name, self.ext = os.path.splitext(self.filename)

        self.type_flag = None
        if "-" in self.name:
            self.type_flag = self.name.rsplit("-", 1)[1].split(".")[0]

        self.meta_flags = self.name.split(".")[1:]

        self.items_cache = None

    def __getstate__(self):
        # Remove computed properties if we pickle this class
        rv = self.__dict__.copy()

        if "__cached_properties__" in rv:
            cached_properties = rv["__cached_properties__"]
            # Copy the key list: deleting entries while iterating over the
            # live view raises RuntimeError on Python 3.
            for key in list(rv.keys()):
                if key in cached_properties:
                    del rv[key]
            del rv["__cached_properties__"]
        return rv

    def name_prefix(self, prefix):
        """Check if the filename starts with a given prefix

        :param prefix: The prefix to check"""
        return self.name.startswith(prefix)

    def is_dir(self):
        """Return whether this file represents a directory."""
        if self.contents is not None:
            return False

        return os.path.isdir(self.rel_path)

    def open(self):
        """
        Return either
        * the contents specified in the constructor, if any;
        * a File object opened for reading the file contents.
        """

        if self.contents is not None:
            file_obj = ContextManagerBytesIO(self.contents)
        else:
            file_obj = open(self.path, 'rb')
        return file_obj

    @cached_property
    def path(self):
        return os.path.join(self.tests_root, self.rel_path)

    @cached_property
    def url(self):
        return rel_path_to_url(self.rel_path, self.url_base)

    @cached_property
    def hash(self):
        with self.open() as f:
            return hashlib.sha1(f.read()).hexdigest()

    def in_non_test_dir(self):
        if self.dir_path == "":
            return True

        parts = self.dir_path.split(os.path.sep)

        if (parts[0] in self.root_dir_non_test
                or any(item in self.dir_non_test for item in parts)
                or any(parts[:len(path)] == list(path)
                       for path in self.dir_path_non_test)):
            return True
        return False

    def in_conformance_checker_dir(self):
        return (self.dir_path == "conformance-checkers" or
                self.dir_path.startswith("conformance-checkers" + os.path.sep))

    @property
    def name_is_non_test(self):
        """Check if the file name matches the conditions for the file to
        be a non-test file"""
        return (self.is_dir() or self.name_prefix("MANIFEST")
                or self.filename.startswith(".") or self.type_flag == "support"
                or self.in_non_test_dir())

    @property
    def name_is_conformance(self):
        return (self.in_conformance_checker_dir()
                and self.type_flag in ("is-valid", "no-valid"))

    @property
    def name_is_conformance_support(self):
        return self.in_conformance_checker_dir()

    @property
    def name_is_stub(self):
        """Check if the file name matches the conditions for the file to
        be a stub file"""
        return self.name_prefix("stub-")

    @property
    def name_is_manual(self):
        """Check if the file name matches the conditions for the file to
        be a manual test file"""
        return self.type_flag == "manual"

    @property
    def name_is_visual(self):
        """Check if the file name matches the conditions for the file to
        be a visual test file"""
        return self.type_flag == "visual"

    @property
    def name_is_multi_global(self):
        """Check if the file name matches the conditions for the file to
        be a multi-global js test file"""
        return "any" in self.meta_flags and self.ext == ".js"

    @property
    def name_is_worker(self):
        """Check if the file name matches the conditions for the file to
        be a worker js test file"""
        return "worker" in self.meta_flags and self.ext == ".js"

    @property
    def name_is_window(self):
        """Check if the file name matches the conditions for the file to
        be a window js test file"""
        return "window" in self.meta_flags and self.ext == ".js"

    @property
    def name_is_webdriver(self):
        """Check if the file name matches the conditions for the file to
        be a webdriver spec test file"""
        # wdspec tests are in subdirectories of /webdriver excluding __init__.py
        # files.
        rel_dir_tree = self.rel_path.split(os.path.sep)
        return (((rel_dir_tree[0] == "webdriver" and len(rel_dir_tree) > 1) or
                 (rel_dir_tree[:2] == ["infrastructure", "webdriver"]
                  and len(rel_dir_tree) > 2))
                and self.filename not in ("__init__.py", "conftest.py")
                and fnmatch(self.filename, wd_pattern))

    @property
    def name_is_reference(self):
        """Check if the file name matches the conditions for the file to
        be a reference file (not a reftest)"""
        return "/reference/" in self.url or "/reftest/" in self.url or bool(
            reference_file_re.search(self.name))

    @property
    def markup_type(self):
        """Return the type of markup contained in a file, based on its extension,
        or None if it doesn't contain markup"""
        ext = self.ext

        if not ext:
            return None
        if ext[0] == ".":
            ext = ext[1:]
        if ext in ["html", "htm"]:
            return "html"
        if ext in ["xhtml", "xht", "xml"]:
            return "xhtml"
        if ext == "svg":
            return "svg"
        return None

    @cached_property
    def root(self):
        """Return an ElementTree Element for the root node of the file if it contains
        markup, or None if it does not"""
        if not self.markup_type:
            return None

        parser = self.parsers[self.markup_type]

        with self.open() as f:
            try:
                tree = parser(f)
            except Exception:
                return None

        if hasattr(tree, "getroot"):
            root = tree.getroot()
        else:
            root = tree

        return root

    @cached_property
    def timeout_nodes(self):
        """List of ElementTree Elements corresponding to nodes in a test that
        specify timeouts"""
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}meta[@name='timeout']")

    @cached_property
    def script_metadata(self):
        if self.name_is_worker or self.name_is_multi_global or self.name_is_window:
            regexp = js_meta_re
        elif self.name_is_webdriver:
            regexp = python_meta_re
        else:
            return None

        with self.open() as f:
            return list(read_script_metadata(f, regexp))

    @cached_property
    def timeout(self):
        """The timeout of a test or reference file. "long" if the file has an extended timeout
        or None otherwise"""
        if self.script_metadata:
            if any(m == (b"timeout", b"long") for m in self.script_metadata):
                return "long"

        if self.root is None:
            return None

        if self.timeout_nodes:
            timeout_str = self.timeout_nodes[0].attrib.get("content", None)
            if timeout_str and timeout_str.lower() == "long":
                return "long"

        return None

    @cached_property
    def viewport_nodes(self):
        """List of ElementTree Elements corresponding to nodes in a test that
        specify viewport sizes"""
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}meta[@name='viewport-size']")

    @cached_property
    def viewport_size(self):
        """The viewport size of a test or reference file"""
        if self.root is None:
            return None

        if not self.viewport_nodes:
            return None

        return self.viewport_nodes[0].attrib.get("content", None)

    @cached_property
    def dpi_nodes(self):
        """List of ElementTree Elements corresponding to nodes in a test that
        specify device pixel ratios"""
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}meta[@name='device-pixel-ratio']"
        )

    @cached_property
    def dpi(self):
        """The device pixel ratio of a test or reference file"""
        if self.root is None:
            return None

        if not self.dpi_nodes:
            return None

        return self.dpi_nodes[0].attrib.get("content", None)

    @cached_property
    def testharness_nodes(self):
        """List of ElementTree Elements corresponding to nodes representing a
        testharness.js script"""
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}script[@src='/resources/testharness.js']"
        )

    @cached_property
    def content_is_testharness(self):
        """Boolean indicating whether the file content represents a
        testharness.js test"""
        if self.root is None:
            return None
        return bool(self.testharness_nodes)

    @cached_property
    def variant_nodes(self):
        """List of ElementTree Elements corresponding to nodes representing a
        test variant"""
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}meta[@name='variant']")

    @cached_property
    def test_variants(self):
        rv = []
        if self.ext == ".js":
            for (key, value) in self.script_metadata:
                if key == b"variant":
                    rv.append(value.decode("utf-8"))
        else:
            for element in self.variant_nodes:
                if "content" in element.attrib:
                    variant = element.attrib["content"]
                    rv.append(variant)

        for variant in rv:
            assert variant == "" or variant[0] in ["#", "?"], variant

        if not rv:
            rv = [""]

        return rv

    @cached_property
    def testdriver_nodes(self):
        """List of ElementTree Elements corresponding to nodes representing a
        testdriver.js script"""
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}script[@src='/resources/testdriver.js']"
        )

    @cached_property
    def has_testdriver(self):
        """Boolean indicating whether the file content represents a
        testharness.js test"""
        if self.root is None:
            return None
        return bool(self.testdriver_nodes)

    @cached_property
    def reftest_nodes(self):
        """List of ElementTree Elements corresponding to nodes representing a
        to a reftest <link>"""
        if self.root is None:
            return []

        match_links = self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}link[@rel='match']")
        mismatch_links = self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}link[@rel='mismatch']")
        return match_links + mismatch_links

    @cached_property
    def references(self):
        """List of (ref_url, relation) tuples for any reftest references specified in
        the file"""
        rv = []
        rel_map = {"match": "==", "mismatch": "!="}
        for item in self.reftest_nodes:
            if "href" in item.attrib:
                ref_url = urljoin(self.url,
                                  item.attrib["href"].strip(space_chars))
                ref_type = rel_map[item.attrib["rel"]]
                rv.append((ref_url, ref_type))
        return rv

    @cached_property
    def content_is_ref_node(self):
        """Boolean indicating whether the file is a non-leaf node in a reftest
        graph (i.e. if it contains any <link rel=[mis]match>"""
        return bool(self.references)

    @cached_property
    def css_flag_nodes(self):
        """List of ElementTree Elements corresponding to nodes representing a
        flag <meta>"""
        if self.root is None:
            return []
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}meta[@name='flags']")

    @cached_property
    def css_flags(self):
        """Set of flags specified in the file"""
        rv = set()
        for item in self.css_flag_nodes:
            if "content" in item.attrib:
                for flag in item.attrib["content"].split():
                    rv.add(flag)
        return rv

    @cached_property
    def content_is_css_manual(self):
        """Boolean indicating whether the file content represents a
        CSS WG-style manual test"""
        if self.root is None:
            return None
        # return True if the intersection between the two sets is non-empty
        return bool(
            self.css_flags & {
                "animated", "font", "history", "interact", "paged", "speech",
                "userstyle"
            })

    @cached_property
    def spec_link_nodes(self):
        """List of ElementTree Elements corresponding to nodes representing a
        <link rel=help>, used to point to specs"""
        if self.root is None:
            return []
        return self.root.findall(
            ".//{http://www.w3.org/1999/xhtml}link[@rel='help']")

    @cached_property
    def spec_links(self):
        """Set of spec links specified in the file"""
        rv = set()
        for item in self.spec_link_nodes:
            if "href" in item.attrib:
                rv.add(item.attrib["href"].strip(space_chars))
        return rv

    @cached_property
    def content_is_css_visual(self):
        """Boolean indicating whether the file content represents a
        CSS WG-style visual test"""
        if self.root is None:
            return None
        return bool(
            self.ext in {'.xht', '.html', '.xhtml', '.htm', '.xml', '.svg'}
            and self.spec_links)

    @property
    def type(self):
        rv, _ = self.manifest_items()
        return rv

    def manifest_items(self):
        """List of manifest items corresponding to the file. There is typically one
        per test, but in the case of reftests a node may have corresponding manifest
        items without being a test itself."""

        if self.items_cache:
            return self.items_cache

        if self.name_is_non_test:
            rv = "support", [SupportFile(self)]

        elif self.name_is_stub:
            rv = Stub.item_type, [Stub(self, self.url)]

        elif self.name_is_manual:
            rv = ManualTest.item_type, [ManualTest(self, self.url)]

        elif self.name_is_conformance:
            rv = ConformanceCheckerTest.item_type, [
                ConformanceCheckerTest(self, self.url)
            ]

        elif self.name_is_conformance_support:
            rv = "support", [SupportFile(self)]

        elif self.name_is_visual:
            rv = VisualTest.item_type, [VisualTest(self, self.url)]

        elif self.name_is_multi_global:
            globals = b""
            for (key, value) in self.script_metadata:
                if key == b"global":
                    globals = value
                    break

            tests = [
                TestharnessTest(self,
                                global_variant_url(self.url, suffix) + variant,
                                timeout=self.timeout)
                for suffix in sorted(global_suffixes(globals))
                for variant in self.test_variants
            ]
            rv = TestharnessTest.item_type, tests

        elif self.name_is_worker:
            test_url = replace_end(self.url, ".worker.js", ".worker.html")
            tests = [
                TestharnessTest(self, test_url + variant, timeout=self.timeout)
                for variant in self.test_variants
            ]
            rv = TestharnessTest.item_type, tests

        elif self.name_is_window:
            test_url = replace_end(self.url, ".window.js", ".window.html")
            tests = [
                TestharnessTest(self, test_url + variant, timeout=self.timeout)
                for variant in self.test_variants
            ]
            rv = TestharnessTest.item_type, tests

        elif self.name_is_webdriver:
            rv = WebdriverSpecTest.item_type, [
                WebdriverSpecTest(self, self.url, timeout=self.timeout)
            ]

        elif self.content_is_css_manual and not self.name_is_reference:
            rv = ManualTest.item_type, [ManualTest(self, self.url)]

        elif self.content_is_testharness:
            rv = TestharnessTest.item_type, []
            testdriver = self.has_testdriver
            for variant in self.test_variants:
                url = self.url + variant
                rv[1].append(
                    TestharnessTest(self,
                                    url,
                                    timeout=self.timeout,
                                    testdriver=testdriver))

        elif self.content_is_ref_node:
            rv = (RefTestNode.item_type, [
                RefTestNode(self,
                            self.url,
                            self.references,
                            timeout=self.timeout,
                            viewport_size=self.viewport_size,
                            dpi=self.dpi)
            ])

        elif self.content_is_css_visual and not self.name_is_reference:
            rv = VisualTest.item_type, [VisualTest(self, self.url)]

        else:
            rv = "support", [SupportFile(self)]

        self.items_cache = rv

        return rv
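For context, a hedged sketch of how a caller might drive SourceFile — the tests_root path and file list are placeholders, and this assumes an actual web-platform-tests checkout at that location:

tests_root = "/path/to/web-platform-tests"
for rel_path in ["dom/nodes/Node-cloneNode.html",
                 "resources/testharness.js"]:
    source_file = SourceFile(tests_root, rel_path, url_base="/")
    # manifest_items() classifies the file and builds its test items.
    item_type, items = source_file.manifest_items()
    print(rel_path, "->", item_type)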
Example #52
0
    def __init__(self, url: str, content: str) -> None:
        super().__init__(url=url)

        self._parsed = html5lib.parse(content, namespaceHTMLElements=False)
Example #53
0
def parseDocument(text):
    doc = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
    return doc
Example #54
0
def html5_parse(data, max_nesting_depth=100):
    import html5lib
    # html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
    data = re.sub(r'<\s*title\s*[^>]*/\s*>', '<title></title>', data)

    data = html5lib.parse(data, treebuilder='lxml').getroot()

    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('html5lib resulted in a tree with nesting'
                                 ' depth > %d' % max_nesting_depth)
    # Set lang correctly
    xl = data.attrib.pop('xmlU0003Alang', None)
    if xl is not None and 'lang' not in data.attrib:
        data.attrib['lang'] = xl

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
    xmlns_declaration = '{%s}' % XMLNS_NS
    non_html5_namespaces = {}
    seen_namespaces = set()
    for elem in tuple(data.iter(tag=etree.Element)):
        elem.attrib.pop('xmlns', None)
        namespaces = {}
        for x in tuple(elem.attrib):
            if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
                # A namespace declaration
                val = elem.attrib.pop(x)
                if x.startswith('xmlnsU0003A'):
                    prefix = x[11:]
                    namespaces[prefix] = val

        if namespaces:
            # Some destroyed namespace declarations were found
            p = elem.getparent()
            if p is None:
                # We handle the root node later
                non_html5_namespaces = namespaces
            else:
                idx = p.index(elem)
                p.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                p.insert(idx, elem)

        b = barename(elem.tag)
        idx = b.find('U0003A')
        if idx > -1:
            prefix, tag = b[:idx], b[idx + 6:]
            ns = elem.nsmap.get(prefix, None)
            if ns is None:
                ns = non_html5_namespaces.get(prefix, None)
            if ns is not None:
                elem.tag = '{%s}%s' % (ns, tag)

        for b in tuple(elem.attrib):
            idx = b.find('U0003A')
            if idx > -1:
                prefix, tag = b[:idx], b[idx + 6:]
                ns = elem.nsmap.get(prefix, None)
                if ns is None:
                    ns = non_html5_namespaces.get(prefix, None)
                if ns is not None:
                    elem.attrib['{%s}%s' % (ns, tag)] = elem.attrib.pop(b)

        seen_namespaces |= set(elem.nsmap.itervalues())

    nsmap = dict(html5lib.constants.namespaces)
    nsmap[None] = nsmap.pop('html')
    non_html5_namespaces.update(nsmap)
    nsmap = non_html5_namespaces

    data = clone_element(data, nsmap=nsmap, in_context=False)

    # Remove unused namespace declarations
    fnsmap = {
        k: v
        for k, v in nsmap.iteritems() if v in seen_namespaces and v != XMLNS_NS
    }
    return clone_element(data, nsmap=fnsmap, in_context=False)
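A hedged usage sketch (Python 2, matching the idioms above; clone_element, barename, node_depth and XMLNS_NS come from calibre's lxml helpers, so this only runs inside that codebase):

markup = ('<html xmlns:svg="http://www.w3.org/2000/svg">'
          '<body><svg:svg/></body></html>')
root = html5_parse(markup)
# The svg: prefix should survive as a real namespace on the rebuilt tree.
print(root.nsmap)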
Example #55
0
    def __init__(self, content, url, headers=None, trusted=None):
        self.content = content
        self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
        self.url = url
        self.headers = headers
        self.trusted = trusted
Example #56
0
    def __init__(self, parent, filename, test_type):
        self.url = parent.session.config.server.url(filename)
        self.type = test_type
        self.variants = []
        # Some tests rely on the WPT server's substitution functionality,
        # so tests must be retrieved from the server rather than read from
        # the file system directly.
        handle = urllib.request.urlopen(
            self.url, context=parent.session.config.ssl_context)
        try:
            markup = handle.read()
        finally:
            handle.close()

        if test_type not in TEST_TYPES:
            raise ValueError('Unrecognized test type: "%s"' % test_type)

        parsed = html5lib.parse(markup, namespaceHTMLElements=False)
        name = None
        includes_variants_script = False
        self.expected = None

        # getiterator() was removed in Python 3.9; iter() is the equivalent.
        for element in parsed.iter():
            if not name and element.tag == 'title':
                name = element.text
                continue
            if element.tag == 'meta' and element.attrib.get(
                    'name') == 'variant':
                self.variants.append(element.attrib.get('content'))
                continue
            if element.tag == 'script':
                if element.attrib.get('id') == 'expected':
                    try:
                        self.expected = json.loads(text_type(element.text))
                    except ValueError:
                        print("Failed parsing JSON in %s" % filename)
                        raise

                src = element.attrib.get('src', '')

                if 'variants.js' in src:
                    includes_variants_script = True
                    if not resolve_uri(filename, src):
                        raise ValueError(
                            'Could not resolve path "%s" from %s' %
                            (src, filename))

        if not name:
            raise ValueError('No name found in %s; add a <title> element' %
                             filename)
        elif self.type == 'functional':
            if not self.expected:
                raise ValueError(
                    'Functional tests must specify expected report data')
            if not includes_variants_script:
                raise ValueError(
                    'No variants script found in file %s; add '
                    '\'<script src="../../variants.js"></script>\'' % filename)
            if len(self.variants) == 0:
                self.variants = DEFAULT_VARIANTS
        elif self.type == 'unit' and self.expected:
            raise ValueError(
                'Unit tests must not specify expected report data')

        # Ensure that distinct items have distinct fspath attributes.
        # This is necessary because pytest has an internal cache keyed on it,
        # and only the first test with any given fspath will be run.
        #
        # This cannot use super(HTMLItem, self).__init__(..) because only the
        # Collector constructor takes the fspath argument.
        pytest.Item.__init__(self, name, parent)
        pytest.Collector.__init__(self,
                                  name,
                                  parent,
                                  fspath=py.path.local(filename))
Example #57
0
def htmlparse(t, encoding=None):
    return html5lib.parse(t, treebuilder='lxml', namespaceHTMLElements=False)
Example #58
0
def parse_html(html: str) -> ET.Element:
    '''Parse HTML and return an element tree with namespaces removed.'''

    doc = html5lib.parse(html)
    doc = remove_namespace(doc)
    return doc
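remove_namespace is not shown above; a minimal sketch of what such a helper typically does — stripping the {namespace} prefix html5lib puts on every tag — might look like this (a hypothetical implementation, not the original):

import xml.etree.ElementTree as ET


def remove_namespace(doc: ET.Element) -> ET.Element:
    # Strip the '{http://www.w3.org/1999/xhtml}' prefix from every tag so
    # that plain tag names work in find()/findall().
    for el in doc.iter():
        if isinstance(el.tag, str) and el.tag.startswith('{'):
            el.tag = el.tag.split('}', 1)[1]
    return doc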
Example #59
0
def _parse_html(f):
    # type: (BinaryIO) -> ElementTree.ElementTree
    doc = html5lib.parse(f, treebuilder="etree", useChardet=False)
    if MYPY:
        return cast(ElementTree.ElementTree, doc)
    return doc
Example #60
0
def get_player_stats(team_url, season, league_name, results_array, goalie_results_array):
    if results_array is None:
        results_array = []

    if len(results_array) == 0:
        results_array.append(['Name', 'Position', 'Season', 'League',
                              'Team', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', 'ID'])

    if goalie_results_array is None:
        goalie_results_array = []

    if len(goalie_results_array) == 0:
        goalie_results_array.append(
            ['Name', 'Season', 'League', 'Team', 'GP', 'GAA', 'SV%', 'ID'])

    team_search_request = requests.get(team_url + '?tab=stats#players')
    team_page = html5lib.parse(team_search_request.text)

    team_name = team_page.find('.//*[@id="name-and-logo"]/{0}h1'.format(helpers.html_prefix)).text.strip()

    player_table = team_page.find(
        './/*[@id="players"]/{0}div[1]/{0}div[4]/{0}table'.format(helpers.html_prefix))
    goalies_table = team_page.find(
        './/*[@id="players"]/{0}div[2]/{0}div[2]/{0}table'.format(helpers.html_prefix))

    players_grouped = helpers.get_ep_table_rows(player_table)
    goalies_grouped = helpers.get_ep_table_rows(goalies_table)

    for group in players_grouped:
        for player in group:
            player_stats = player.findall(
                './/{}td'.format(helpers.html_prefix))

            name_link = player_stats[NAME].find(
                './{0}span/{0}a'.format(helpers.html_prefix))
            name, position = helpers.get_info_from_player_name(name_link.text)
            id = helpers.get_player_id_from_url(
                name_link.attrib['href'])
            games = player_stats[GAMES].text.strip()
            goals = player_stats[GOALS].text.strip()
            assists = player_stats[ASSISTS].text.strip()
            points = player_stats[POINTS].text.strip()
            pim = player_stats[PIM].text.strip()
            plusminus = player_stats[PLUSMINUS].text.strip()

            results_array.append([
                name,
                position,
                season,
                league_name,
                team_name,
                games,
                goals,
                assists,
                points,
                pim,
                plusminus,
                id,
            ])

    for goalie_group in goalies_grouped:
        for goalie in goalie_group:
            goalie_stats = goalie.findall('./{}td'.format(helpers.html_prefix))

            name_link = goalie_stats[GOALIE_NAME].find(
                './{0}a'.format(helpers.html_prefix))
            name = name_link.text.strip()
            id = helpers.get_player_id_from_url(
                name_link.attrib['href'])

            games = goalie_stats[GOALIE_GP].text.strip()
            gaa = goalie_stats[GOALIE_GAA].text.strip()
            svp = goalie_stats[GOALIE_SVP].text.strip()

            goalie_results_array.append([
                name,
                season,
                league_name,
                team_name,
                games,
                gaa,
                svp,
                id,
            ])
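The function above assumes module-level imports plus column-index constants defined elsewhere in the scraper. A hedged reconstruction — the index values and the team URL are guesses for illustration, not verified against eliteprospects.com:

import requests
import html5lib

import helpers  # project-local module providing html_prefix, table helpers, etc.

# Hypothetical column indices for the skater and goalie stats tables.
NAME, GAMES, GOALS, ASSISTS, POINTS, PIM, PLUSMINUS = 0, 1, 2, 3, 4, 5, 6
GOALIE_NAME, GOALIE_GP, GOALIE_GAA, GOALIE_SVP = 0, 1, 2, 3

skaters, goalies = [], []
get_player_stats('https://www.eliteprospects.com/team/0/placeholder',
                 '2018-19', 'NHL', skaters, goalies)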