Example no. 1
    def write_miss_to_html(self, name: str, url: str, msg: str,
                           html_doc: html.Element):

        s = html.Element("div")
        h = html.Element("h1")
        h.text = name
        s.append(h)

        m = html.Element("div")
        m.text = self.cache.read_date_time_str(name + ".html")
        s.append(m)

        m = html.Element("span")
        m.text = msg
        s.append(m)

        x = html.Element("br")
        s.append(x)
        a = html.Element("a")
        a.attrib["href"] = url
        a.text = url
        s.append(a)

        html_doc.append(s)
        html_doc.append(html.Element("hr"))
Example no. 2
    def parse_game_metadata(self, doc: Element) -> Dict[str, str]:
        # Parse data by id
        date = doc.xpath('//span[@id="fechaLabel"]')
        hour = doc.xpath('//span[@id="horaLabel"]')
        league = doc.xpath('//span[@id="paginaTitulo_ligaLabel"]')
        season = doc.xpath('//span[@id="paginaTitulo_temporadaLabel"]')
        home_team = doc.xpath('//a[@id="equipoLocalHyperLink"]')
        home_score = doc.xpath('//span[@id="resultadoLocalLabel"]')
        away_team = doc.xpath('//a[@id="equipoVisitanteHyperLink"]')
        away_score = doc.xpath('//span[@id="resultadoVisitanteLabel"]')

        main_referee = doc.xpath('//span[@id="arbitroPrincipalLabel"]')
        second_referee = doc.xpath('//span[@id="arbitroAuxiliarLabel"]')

        metadata_dict = {
            "date": self.parse_str(date[0].text_content()),
            "hour": self.parse_str(hour[0].text_content()),
            "league": self.parse_str(league[0].text_content()),
            "season": self.parse_str(season[0].text_content()),
            "home_team": self.parse_str(home_team[0].text_content()),
            "home_score": self.parse_str(home_score[0].text_content()),
            "away_team": self.parse_str(away_team[0].text_content()),
            "away_score": self.parse_str(away_score[0].text_content()),
            "main_referee": self.parse_str(main_referee[0].text_content()),
            "second_referee": self.parse_str(second_referee[0].text_content()),
        }

        return metadata_dict
Example no. 3
    def write_as_html(self, foutput, name: str, url: str,
                      tables: List[ContentTable], html_doc: html.Element):

        s = html.Element("div")
        h = html.Element("h1")
        h.text = name
        s.append(h)

        m = html.Element("div")
        m.text = self.cache.read_date_time_str(name + ".html")
        s.append(m)

        for t in tables:
            s.append(t.new_element)

        x = html.Element("br")
        s.append(x)
        a = html.Element("a")
        a.attrib["href"] = url
        a.text = url
        s.append(a)

        h = html.Element("html")
        h.append(html.Element("body"))
        h[0].append(deepcopy(s))
        foutput.write(html.tostring(h, pretty_print=True))

        html_doc.append(s)
        html_doc.append(html.Element("hr"))
Example no. 4
    def _apply_font_icons(html):
        root = fromstring(wrap_unwrap_fake_tag(html))
        for element in root.iter('a'):
            resource = element.attrib.get('href')
            if not (resource and element.text):  # skip anchors with no href or no text
                continue

            # External link
            if resource.startswith('https://github.com'):
                icon_class = HTMLGen.EXTERNAL_LINK_GITHUB_ICON_CLASS
            elif resource.startswith('http'):
                icon_class = HTMLGen.EXTERNAL_LINK_ICON_CLASS
            # Anchor
            elif resource.startswith('#'):
                icon_class = HTMLGen.ANCHOR_LINK_ICON_CLASS
            # File
            elif any(map(resource.endswith, HTMLGen.EXTENSIONS_ICON_CLASSES_MAP.keys())):
                extension = resource.rsplit('.', 1)[-1]
                icon_class = HTMLGen.EXTENSIONS_ICON_CLASSES_MAP[extension]
            else:
                print('Unknown icon resource ', resource)
                continue

            # Element prototype
            span_element = Element('span', attrib={'class': 'iconify', 'data-icon': icon_class})
            span_element.tail = ' ' + element.text
            element.text = None
            element.insert(0, span_element)
        html = tostring(root)
        html = wrap_unwrap_fake_tag(html, wrap=False)
        return html
def parse_li(li: Element, path: str, index: int):
    """

    :param
    """
    prefix = "000" + str(index)
    prefix = prefix[len(prefix) - 2:]
    if check_li(li):
        tem_dir = li.xpath("./a/text()")
        tem_a_url = li.xpath("./a/@href")
        if tem_dir:
            a_href = urljoin(BASE_URL, tem_a_url[0])
            # we now have the innermost URL and can parse the page it points to
            res_ = requests.get(a_href)
            content_ = res_.text.encode("ISO-8859-1").decode('utf-8')
            file_name = path + "/{}-".format(prefix) + tem_dir[0].strip() + ".md"
            run(content_, '//*[@id="book-search-results"]/div[1]/section/*', file_name,
                base_url="http://www.topgoer.com")
            # print(file_name, a_href)
        return
    else:
        # create the directory for this branch node
        a_title = li.xpath("./a/text()")
        full_path = ""
        if a_title:
            # create the sub-directory
            sub_path = a_title[0].strip()
            full_path = path + "/{}-".format(prefix) + sub_path
            if not os.path.exists(full_path):
                os.makedirs(full_path)
        li_list_ = li.xpath("./ul/li")
        for i_ in range(len(li_list_)):
            parse_li(li_list_[i_], full_path, i_)
Example no. 6
def process_img(self, doc, el):
    """ Process <img> tag in the source document.
    """
    self.add_alt_tags(el)

    # Skip over images with the nomobileresize attribute
    if el.attrib.pop("nomobileresize", "") != "":
        return

    src = el.attrib.get("src", None)
    if src:
        originalSrc = src
        site = getSite()
        # catch exceptions to ensure broken images don't
        # prevent the page from rendering 
        try:
            src = self.rewrite(src)
            shorturl = getUtility(IMobileImageShortURLStorage)
            key = shorturl.getkey(src)
            if key is None:
                key = shorturl.suggest()
                # just check that suggest() is working as expected
                assert shorturl.get(key) is None
                shorturl.add(key, src)
            src = '%s/@@shortimageurl/%s' % (site.absolute_url(), key)
            el.attrib["src"] = src
        except Exception:
            # blank alt text
            del el.attrib["alt"]
            el.attrib["src"] = src
            error = ['src: %s' % src,
                     'URL: %s' % site.REQUEST.URL,
                     'Referer: %s' % site.REQUEST.HTTP_REFERER,
                     'User Agent: %s' % site.REQUEST.get('HTTP_USER_AGENT', 
                                                         'Unknown'),
                     traceback.format_exc()]
            # Stop logging image processing errors, it creates
            # unnecessary noise in the error log
            # error = '\n'.join(error)
            # LOG.info(error)
        
        # Make image clickable and point to original src
        a = Element('a')
        a.attrib['href'] = originalSrc
        el.getparent().replace(el, a)
        a.append(el)

        # Remove explicit width declarations
        if "width" in el.attrib:            
            del el.attrib["width"]

        if "height" in el.attrib:            
            del el.attrib["height"]
        
    if self.needs_clearing(el):
        self.clear_floats(el)
    
    self.add_processed_class(el)
Example no. 7
File: xml.py Project: brabadu/pml
def _dumps_xml_from_pml_nodes(root_node):
    node_name, attributes, sub_nodes = root_node

    element = Element(node_name, **attributes)

    for sub_node in sub_nodes:
        element.append(_dumps_xml_from_pml_nodes(sub_node))

    return element
 def _indent_elem(self, elem: html.Element, depth: int):
     if len(elem) > 0:
         elem.text = self._indent_text(elem.text, depth + 1)
         for ch in elem:
             self._indent_elem(ch, depth + 1)
         elem[-1].tail = self._indent_text(elem.tail, depth)
         elem.tail = self._indent_text(elem.tail, depth)
     else:
         elem.text = self._indent_text(elem.text, 0)
         elem.tail = self._indent_text(elem.tail, depth)
Example no. 9
def get_user(username, rank):
    if rank is None:
        element = Element('span')
    else:
        element = Element('a', {
            'class': rank,
            'href': reverse('user_page', args=[username])
        })
    element.text = username
    return element
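
A brief usage sketch (hypothetical call: it assumes a Django URL pattern named 'user_page' for the reverse() lookup, and that Element and tostring come from lxml.html):

from lxml.html import tostring

el = get_user("alice", "rate-master")
print(tostring(el))
# something like: b'<a class="rate-master" href="/user/alice">alice</a>'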
Example no. 10
def brs_to_paragraphs(tree, inline_tags=None):
    """
    Return an lxml tree with all <br> elements stripped and paragraphs put in
    place where necessary.
    """
    # add these tags to p's that we're currently building, any other tags will
    # close the current p
    inline_tags = inline_tags or ['a']

    # if this tree doesn't have any child elements, just return it as is
    if len(tree) == 0:
        return tree

    # if this tree doesn't contain any <br> tags, we don't need to touch it
    if tree.find('.//br') is None:
        return tree

    # XXX: We're building a whole new tree here and leaving out any attributes.
    # A) That might be a little slower and more memory intensive than modifying
    # the tree in place, and B) we're dropping any attributes on block elements.
    # The latter is probably fine for current use, but certainly not ideal.
    new_tree = Element(tree.tag)

    # if this tree starts out with text, create a new paragraph for it, and
    # add it to the tree
    if tree.text:
        p = E.P()
        p.text = tree.text
        new_tree.append(p)

    for e in tree:
        if e.tag == 'br':
            # avoid adding empty p elements
            if e.tail is None:
                continue
            # start a new p
            p = E.P()
            p.text = e.tail
            new_tree.append(p)
        # if this is a block tag, and it has trailing text, that text needs to
        # go into a new paragraph... only if the tail has actual content and
        # not just whitespace though.
        elif e.tail and re.match(r'[^\s]', e.tail) and e.tag not in inline_tags:
            p = E.P()
            p.text = e.tail
            e.tail = ''
            new_tree.append(e)
            new_tree.append(p)
        # keep inline tags inside the current paragraph
        elif e.tag in inline_tags:
            p.append(e)
        else:
            # recurse, keeping the caller's inline_tags
            new_tree.append(brs_to_paragraphs(e, inline_tags))

    return new_tree
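
A minimal usage sketch for the function above (assuming, as the snippet implies, that Element and E come from lxml.html and lxml.html.builder):

from lxml.html import fromstring, tostring

tree = fromstring("<div>first line<br>second line<br>third line</div>")
print(tostring(brs_to_paragraphs(tree)))
# roughly: b'<div><p>first line</p><p>second line</p><p>third line</p></div>'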
Example no. 11
def add_bootstrap_in_html_header(input_filename, output_filename):
    with open(input_filename) as fp:
        htmlstring = fp.read()
    bootstrap = Element("link")
    bootstrap.attrib["rel"] = "stylesheet"
    bootstrap.attrib["href"] = "static/css/bootstrap.min.css"
    bootstrap.attrib["type"] = "text/css"
    html = lhtml.fromstring(htmlstring)
    html.head.append(bootstrap)
    with open(output_filename, "w") as fp:
        fp.write(lhtml.tostring(html, encoding=str))
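
A short usage sketch (file names are hypothetical; the function relies on lhtml being lxml.html, whose parsed documents expose a .head property):

add_bootstrap_in_html_header("report.html", "report_with_bootstrap.html")
# the output page now links static/css/bootstrap.min.css from its <head>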
Example no. 12
def parse_news_block(element: Element, basic_url: str) -> Union[Dict, None]:
    payload = dict()
    try:
        payload["news_title"] = element.xpath(
            ".//span[@class='newslist__text-title']/text()")[0]
        payload["news_url"] = "".join([basic_url, element.xpath("./@href")[0]])
        payload["image_url"] = "".join(
            [basic_url, element.xpath(".//img/@src")[0]])
    except IndexError:
        return None
    return payload
Example no. 13
 def test_html_style_tag_css_import(self):
     source = Element('style')
     source.text = '@import url(#);'
     elements = list(links(source))
     self.assertEqual(len(elements), 1)
     el, attr, url, pos = elements.pop()
     self.assertEqual(url, '#')
     self.assertEqual(pos, 12)
     self.assertEqual(attr, None)
     self.assertEqual(el.tag, 'style')
     self.assertEqual(el.attrib, {})
Example no. 14
def get_user(username, data):
    if not data:
        element = Element('span')
        element.text = username
        return element

    element = Element('span', {'class': Profile.get_user_css_class(*data)})
    link = Element('a', {'href': reverse('user_page', args=[username])})
    link.text = username
    element.append(link)
    return element
Example no. 15
    def indent_data_table(self, t: html.Element) -> html.Element:

        prefix = "\n      "
        xprefix = prefix + "  "
        t.text = xprefix
        t.tail = prefix
        for ch in t:
            if len(ch) > 0:
                self.indent_element(ch, 0, xprefix)
            ch.tail = xprefix
        t[-1].tail = prefix
        return t
Example no. 16
 def test_script_element_with_url_in_the_text(self):
     source = Element('script')
     source.text = 'var background = "url(\'image.jpg\')"'
     elements = list(links(source))
     self.assertEqual(len(elements), 1)
     el, attr, url, pos = elements.pop()
     self.assertEqual(url, 'image.jpg')
     self.assertEqual(pos, 23)
     self.assertEqual(attr, None)
     self.assertEqual(el.tag, 'script')
     self.assertEqual(el.attrib, {})
Example no. 17
 def test_html_style_tag_css_url_with_altering_colons(self):
     source = Element('style')
     source.text = 'html {background: url("#\');}'
     elements = list(links(source))
     self.assertEqual(len(elements), 1)
     el, attr, url, pos = elements.pop()
     self.assertEqual(url, '#')
     self.assertEqual(pos, 23)
     self.assertEqual(attr, None)
     self.assertEqual(el.tag, 'style')
     self.assertEqual(el.attrib, {})
    def _inject_extra_elements(self, tree: html.Element, xurl: str):
        if xurl is None: return

        if len(tree) == 0 or tree[0].tag != "head":
            return

        head = tree[0]
        base = head.findall("base")
        if len(base) > 0: return

        # <base> belongs inside <head> and uses the href attribute
        base = html.Element("base")
        base.attrib["href"] = xurl
        head.insert(0, base)
Example no. 19
    def clean_element(self, elem: html.Element):

        tag = elem.tag
        if tag in [
                "script", "noscript", "style", "meta", "input", "iframe",
                "select", "link", "font"
        ]:
            elem.getparent().remove(elem)
            return
        if tag == etree.Comment:
            elem.getparent().remove(elem)
            return
        if tag == etree.ProcessingInstruction:
            elem.getparent().remove(elem)
            return

        if tag == "form":
            a = elem.attrib.get("action")
            if a != None: del elem.attrib["action"]
            x = elem.attrib.get("onsubmit")
            if x != None: del elem.attrib["onsubmit"]

        if tag == "a":
            href = elem.attrib.get("href")
            if href is not None and (href.startswith("https://twitter.com")
                                     or href.startswith("http://twitter.com")
                                     or href.startswith("https://t.co")):
                self.remove_twitter_cluster(elem.getparent())

        if tag == "svg":
            while len(elem):
                del elem[0]
        else:
            # iterate over a snapshot: clean_element() may remove children
            for ch in list(elem):
                self.clean_element(ch)

        if tag in ["div", "span"]:
            if self.is_empty(elem):
                elem.getparent().remove(elem)
                return
            if self.mark_special_case(elem):
                return
        elif tag in ["a"]:
            if self.mark_special_case(elem):
                return

            # strip spaces from simple links
            if len(elem) > 0:
                elem[-1].tail = None
            elif elem.text is not None:
                elem.text = elem.text.strip()

        self.clean_attributes(elem)
Example no. 20
    def inject_script_tag(self, html):
        root = lxml.html.fromstring(html)
        if root is None:
            # Sometimes non-html sneaks through the header check
            return html
        with open('mask_headless.js') as f:
            content_js = f.read()
        script = Element("script")
        script.text = content_js

        root.insert(0, script)
        html = lxml.html.tostring(root, method="html").decode('utf-8')
        return html
Example no. 21
def brs_to_paragraphs(tree, inline_tags=None):
    """
    Return an lxml tree with all <br> elements stripped and paragraphs put in
    place where necessary.
    """
    # add these tags to p's that we're currently building, any other tags will
    # close the current p
    inline_tags = inline_tags or ["a"]

    # if this tree doesn't have any child elements, just return it as is
    if len(tree) == 0:
        return tree

    # if this tree doesn't contain any <br> tags, we don't need to touch it
    if tree.find(".//br") is None:
        return tree

    # XXX: We're building a whole new tree here and leaving out any attributes.
    # A) That might be a little slower and more memory intensive than modifying
    # the tree in place, and B) we're dropping any attributes on block elements.
    # The latter is probably fine for current use, but certainly not ideal.
    new_tree = Element(tree.tag)

    # if this tree starts out with text, create a new paragraph for it, and
    # add it to the tree
    if tree.text:
        p = E.P()
        p.text = tree.text
        new_tree.append(p)

    for e in tree:
        if e.tag == "br":
            # avoid adding empty p elements
            if e.tail is None:
                continue
            # start a new p
            p = E.P()
            p.text = e.tail
            new_tree.append(p)
        # if this is a block tag, and it has trailing text, that text needs to
        # go into a new paragraph... only if the tail has actual content and
        # not just whitespace though.
        elif e.tail and re.match(r"[^\s]", e.tail) and e.tag not in inline_tags:
            p = E.P()
            p.text = e.tail
            e.tail = ""
            new_tree.append(e)
            new_tree.append(p)
        # keep inline tags inside the current paragraph
        elif e.tag in inline_tags:
            p.append(e)
        else:
            # recurse, keeping the caller's inline_tags
            new_tree.append(brs_to_paragraphs(e, inline_tags))

    return new_tree
Example no. 22
def wrap_set(dom, child_tag, parent_tag):
    """Wrap unbroken sets of elements in a parent container:
        - <li> in a <ul>
        - <tr> in a <table>
    """
    nxt = 0
    for e in dom.cssselect(child_tag):
        if nxt != e:
            box = Element(parent_tag)
            insert(box, e)
        box.append(e)
        nxt = parent(e).getnext()
        if nxt is None:
            nxt = e.getnext()
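
The snippet above relies on insert() and parent() helpers defined elsewhere in its project. A self-contained sketch of the same idea, wrapping each unbroken run of sibling elements in a new parent, could look like this with plain lxml (wrap_runs is a hypothetical name and only handles direct children):

from lxml import html

def wrap_runs(root, child_tag="li", parent_tag="ul"):
    box = None
    for el in list(root.iterchildren()):
        if el.tag == child_tag:
            if box is None:
                # open a new wrapper where the run starts
                box = html.Element(parent_tag)
                el.addprevious(box)
            box.append(el)  # moves el into the wrapper
        else:
            box = None      # a non-matching sibling ends the current run
    return root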
Example no. 24
 def _apply_headers_anchors(html: str) -> str:
     root_element = fromstring(wrap_unwrap_fake_tag(html))
     for element in root_element:
         if element.tag in HEADERS:
             id_ = make_header_id(element.text)
             a_element = Element('a', {'id': id_, 'href': f'#{id_}'})
             span_element = Element('span', attrib={'class': 'iconify',
                                                    'data-icon': HTMLGen.ANCHOR_LINK_ICON_CLASS})
             a_element.append(span_element)
             element.text += ' '
             element.insert(0, a_element)
     html = tostring(root_element)
     html = wrap_unwrap_fake_tag(html, wrap=False)
     return html
Example no. 25
    def _add_html_info_row(self, t: html.Element, label: str, val: str, cls: str = None):
        tr = html.Element("tr")

        td = html.Element("td")
        td.text = label
        if cls is not None: td.attrib["class"] = cls
        tr.append(td)

        td = html.Element("td")
        td.text = val
        if cls is not None: td.attrib["class"] = cls
        tr.append(td)

        tr.tail = "\n      "
        t.append(tr)        
Example no. 26
def del_with_table(sub: Element, file: TextIO):
    col_name = sub.xpath("//thead/tr/th/text()")
    col_num = len(col_name)
    row_head = "| " + " | ".join(col_name) + " |\n"
    file.write(row_head)
    # print(row_head)
    tem_list = "| " + " | ".join([":-----:" for i in range(col_num)]) + " |\n"
    # print(tem_list)
    file.write(tem_list)
    # handle the table body
    trs = sub.xpath("//tbody/tr")
    for tr in trs:
        row = tr.xpath("./td/text()")
        row_each = "| " + " | ".join(row).replace("\n", "") + " |\n"
        file.write(row_each)
Example no. 27
    def mark_special_case(self, elem: html.Element) -> bool:
        " edit or return element to remove "
        # -- stupid special cases for CA
        #if elem.tag == "div":
        #    xid = elem.get("id")
        #    if xid == "DeltaPlaceHolderPageDescription" or xid == "DeltaPlaceHolderPageTitleInTitleArea":
        #        logger.debug("special case: remove deltaplaceholder")
        #        self.to_remove.append(elem.getparent())
        #        return True
        #elif elem.tag == "a":
        #    href = elem.get("href")
        #    if href == "#ctl00_ctl65_SkipLink":
        #        logger.debug("special case: remove skiplink")
        #        self.to_remove.append(elem.getparent())
        #        return True

        if elem.tag == "div":
            xid = elem.attrib.get("id")
            if xid == "google_translate_element" and len(elem) > 0:
                logger.debug("special case: google_translate_element")
                return elem[0]
            if xid is not None:
                xid2 = re.sub("^[0-9a-fA-F]+-(.*)", "\\1", xid)
                xid2 = re.sub("(.*)-[a-z]?[0-9a-fA-F]+$", "\\1", xid2)
                if xid != xid2:
                    logger.debug("special case: hex data in id")
                    elem.attrib["id"] = xid2

            if elem.attrib.get("fb-xfbml-state"):
                logger.debug("special case: fb")
                return elem

        return False
Example no. 28
def del_with_ul(tag: Element, file: TextIO, space_num=0, prefix=None):
    """
    :param file: 文件
    :param prefix: 前缀
    :arg tag 包含ul的标签
    :arg space_num 空格个数控制格式
    tag的格式是
    """
    tag = etree.HTML(
        etree.tostring(tag, encoding="utf-8", pretty_print=True,
                       method="html").decode())
    li = tag.xpath("/html/body/ul/li")
    # no <li> elements
    if not li:
        return
    else:
        for k in li:
            tem_ = k.xpath("./text()")
            if tem_:
                if not prefix:
                    line = " " * space_num + "* " + tem_[0].replace("\n",
                                                                    "") + "\n"
                    file.write(line)
                    # print(" " * space_num, "*", k.xpath("./text()")[0].replace("\n", ""))
                else:
                    line = prefix + " " + " " * space_num + "* " + tem_[
                        0].replace("\n", "") + "\n"
                    file.write(line)
                    # print(prefix, end="")
                    # print(" " * space_num, "*", k.xpath("./text()")[0].replace("\n", ""))
            tem = k.xpath("./ul")
            if tem is not None and len(tem) > 0:
                del_with_ul(tem[0], space_num=space_num + 1, file=file)
Example no. 29
def cssselect(node: Element, selector: str) -> Element:
    result = node.cssselect(selector)
    if len(result) != 1:
        raise Exception(
            f"Selector {str} on node {inner_html(node)} gave {len(result)} results, needed one"
        )
    return result[0]
def fix_urls(el: Element, base_url: str, broken_urls: List[str],
             urls_to_change: dict) -> Tuple[Element, List[str]]:
    """
    Given an HTML element, turns all ``href`` parameters of ``a`` elements
    inside it into fully-qualified absolute URLs instead of the relative paths
    that are common in the tips content.

    :arg Element el: ``lxml.html.Element`` object, the content to change.
    :arg str base_url: The URL for the page, which serves as the absolute
        point with which to calculate the absolute paths.
    :arg list broken_urls: The list of broken URLs to add to as we find them.
    :arg dict[str, str] urls_to_change: Known broken URLs and their
        replacements.

    :rtype: tuple[Element, list]
    :returns: The Element with its ``a`` elements altered, and the list of
        broken URLs.
    """
    tested_urls = []  # type: List[str]
    for desc in el.iterdescendants():
        if desc.tag == "a" and "href" in desc.attrib:
            fixed_url, tested_urls, broken_urls = fix_url(
                base_url, desc.attrib["href"], tested_urls, broken_urls,
                urls_to_change)
            desc.attrib["href"] = fixed_url
    return (el, broken_urls)
    def convert_ga(self, doc: html.Element) -> Dict:

        t = doc.findall(".//table")
        if len(t) == 0: 
            return { "error": "no tables -> page layout changed", "at": udatetime.now_as_utc() }

        data = self._htmltable_to_dict(t[0])
        if len(data["data"]) != 2:
            return { "error": "expected two data rows", "at": udatetime.now_as_utc() }
        if data["data"][0]["COVID-19 Confirmed Cases"] != "Total":
            return { "error": "first row should be totals", "at": udatetime.now_as_utc() }
        if data["data"][1]["COVID-19 Confirmed Cases"] != "Deaths":
            return { "error": "second row should be deaths", "at": udatetime.now_as_utc() }

        positive = data["data"][0]["No. Cases (%)"]
        positive = int(positive[0: positive.index("(")])
        deaths = data["data"][1]["No. Cases (%)"]
        deaths = int(deaths[0: deaths.index("(")])

        data = self._htmltable_to_dict(t[1])
        if len(data["data"]) != 2:
            return { "error": "expected two data rows", "at": udatetime.now_as_utc() }
        if data["data"][0]["Lab"] != "Commercial Lab":
            return { "error": "first row should be Commerial Lab", "at": udatetime.now_as_utc() }
        if data["data"][1]["Lab"] != "GPHL":
            return { "error": "second row should be GPHL", "at": udatetime.now_as_utc() }
        lab_1 = int(data["data"][0]["Total Tests"])
        lab_2 = int(data["data"][1]["Total Tests"])
        tests = lab_1 + lab_2

        return {
            "positive": positive,
            "tests": tests,
            "deaths": deaths
        }
Example no. 32
def fragment_fromstring(html,
                        create_parent=False,
                        guess_charset=None,
                        parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError("string required")

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html,
        guess_charset=guess_charset,
        parser=parser,
        no_leading_text=not accept_leading_text,
    )

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = "div"
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError("No elements found")
    if len(elements) > 1:
        raise etree.ParserError("Multiple elements found")
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError("Element followed by text: %r" % result.tail)
    result.tail = None
    return result
def innerhtml(el: Element, encoding: str = "utf-8") -> str:
    """
    Returns the HTML of an element as a ``str``, with the opening and closing
    tags removed.

    :arg Element el: ``lxml.html.Element`` object.
    :arg str encoding: The character encoding for the HTML.

    :rtype: str
    :returns: A string of HTML without the opening and closing tags.
    """
    children = [_ for _ in el.iterchildren()]
    if not len(children):
        return el.text_content()
    text = "%s" % el.text if el.text else ""
    return "%s%s" % (text, "".join(
        [tostring(c).decode(encoding) for c in el.iterchildren()]))
def main():
    absolute_folder = sys.argv[1]
    pattern = '*.iml'
    fileList = []
    # Walk through directory
    for dName, sdName, fList in os.walk(absolute_folder):
        for fileName in fList:
            if fnmatch.fnmatch(fileName, pattern): # Match search string
                fileList.append(os.path.join(dName, fileName))
    pbar = ProgressBar(widgets=['Processing :', Percentage(), ' ', Bar(), ' ', ETA()], maxval=len(fileList)).start()
    fcount = 0
    for fileName in fileList:
        output_dict = generate_empty_dict()
        eclipse_file_path = os.path.dirname(fileName)+'/.classpath'
        with open(fileName, 'r') as f:
            intellij_data = f.read()
        if not intellij_data:
            # skip empty .iml files rather than feeding them to xmltodict
            continue
        intellij_dict = xmltodict.parse(intellij_data)
        fcount = fcount + 1
        # print(intellij_dict)
        output_dict = addSrcType(intellij_dict, output_dict)
        output_dict = addCombinedRules(intellij_dict, output_dict)
        output_dict = addConType(intellij_dict, output_dict)
        # print json.dumps(intellij_dict)
        result = bf.etree(output_dict, root=Element('classpath'))

        #print tostring(result)
        with open(eclipse_file_path, 'w') as f:
            data = tostring(result, doctype='<?xml version="1.0" encoding="UTF-8"?>')
            data = data.replace('<classpath>','')
            data = data.replace('</classpath>', '')
            data = data.replace('<?xml version="1.0" encoding="UTF-8"?>', '<?xml version="1.0" encoding="UTF-8"?><classpath>')
            data = data +'</classpath>'
            f.write(data)
        # Add .project file
        project_path = os.path.dirname(fileName)+'/.project'
        xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>%s</name>
    <comment/>
    <projects/>
    <buildSpec>
	<buildCommand>
		<name>org.eclipse.jdt.core.javabuilder</name>
		<arguments/>
	</buildCommand>
    </buildSpec>
    <natures>
	<nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>"""
        root_name = os.path.splitext(os.path.basename(fileName))[0]
        xml_data = xml_data%(root_name)
        with open(project_path, 'w') as f:
            f.write(xml_data)
            pbar.update(fcount)
    pbar.finish()
Example no. 35
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    result.tail = None
    return result
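
A brief usage sketch; the calls below use the standard lxml.html.fragment_fromstring, which follows the same contract as the docstring above (the variant shown here additionally accepts guess_charset):

from lxml import etree
from lxml.html import fragment_fromstring

el = fragment_fromstring("<p>exactly one element</p>")
print(el.tag)                        # p

# with create_parent, leading text is allowed and attaches to the wrapper
wrapped = fragment_fromstring("leading text <b>bold</b>", create_parent=True)
print(wrapped.tag, repr(wrapped.text))   # div 'leading text '

try:
    fragment_fromstring("<p>a</p><p>b</p>")
except etree.ParserError as exc:
    print(exc)                       # Multiple elements found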
Example no. 36
def postprocess(doc):
    # put html lang
    doc.attrib["lang"] = "mk"

    # add meta charset
    doc.cssselect("head")[0].insert(0, Element("meta", attrib={"charset": "utf-8"}))

    # hotlink customized css
    doc.cssselect("head link")[0].attrib["href"] = "stylesheets/timeline-setter-custom.css"

    # add title
    title = Element("title")
    title.text = u"Слободен софтвер Македонија низ годините"
    doc.cssselect("head")[0].insert(0, title)

    # add header
    header = load_partial("header.html")
    doc.cssselect("body")[0].insert(0, header)

    return doc
Example no. 38
 def clean(self, element):
     cleanElement = None
     dropEmpty = ('span', 'p', 'div') 
     downloadDir = self.task.getProperty('download')
     if 'img' == element.tag:
         src = urlparse.urljoin(self.url, element.attrib['src'])
         file, info = urllib.urlretrieve(src)
         url = urlparse.urlparse(src)
         disposition = info.getheader('Content-Disposition')
         filename = None
         if disposition:
             type, filename = disposition.split(';')
             key, filename = filename.split('=')
             filename = filename.strip('"')
         if not filename:
             filename = os.path.basename(file)
         splitf = filename.split('.')
         lenf = len(splitf)
         ext = splitf.pop()
         if lenf < 2 or info.subtype != ext:
             filename = '.'.join((filename, info.subtype))
         element.attrib['src'] = filename
         os.rename(file, '/'.join((downloadDir, filename)))
     #moin specific hack for now
     if 'a' == element.tag and '/Category' in element.attrib.get('href', ''):
         pass
     elif element.tag not in dropEmpty \
             or bool(element.getchildren()) \
             or (bool(element.text) \
                 and bool(element.text.strip())):
         cleanElement = Element(element.tag)
         cleanElement.text = element.text
         stripattribs = ('class', 'style', 'id')
         for a in element.attrib:
             if a not in stripattribs:
                 cleanElement.set(a, element.attrib[a])  
         for e in element.getchildren():
             clean = (self.clean(e))
             if clean is not None:
                 cleanElement.append(clean)
     return cleanElement        
Example no. 39
def get_user_rating(username, data):
    if not data:
        element = Element('span')
        element.text = username
        return element

    rating = data[1]
    element = Element('a', {'class': 'rate-group', 'href': reverse('user_page', args=[username])})
    if rating:
        rating_css = rating_class(rating)
        rate_box = Element('span', {'class': 'rate-box ' + rating_css})
        rate_box.append(Element('span', {'style': 'height: %3.fem' % rating_progress(rating)}))
        user = Element('span', {'class': 'rating ' + rating_css})
        user.text = username
        element.append(rate_box)
        element.append(user)
    else:
        element.text = username
    return element
Example no. 40
 def view_selection(self, req, resp, url):
     """
     View the highlighted selector (from `action_view`)
     """
     from deliverance.selector import Selector
     doc = document_fromstring(resp.body)
     el = Element('base')
     el.set('href', posixpath.dirname(url) + '/')
     doc.head.insert(0, el)
     selector = Selector.parse(req.GET['selector'])
     dummy_type, elements, dummy_attributes = selector(doc)
     if not elements:
         template = self._not_found_template
     else:
         template = self._found_template
     all_elements = []
     els_in_head = False
     for index, el in enumerate(elements):
         el_in_head = self._el_in_head(el)
         if el_in_head:
             els_in_head = True
         anchor = 'deliverance-selection'
         if index:
             anchor += '-%s' % index
         if el.get('id'):
             anchor = el.get('id')
         ## FIXME: is a <a name> better?
         if not el_in_head:
             el.set('id', anchor)
         else:
             anchor = None
         ## FIXME: add :target CSS rule
         ## FIXME: or better, some Javascript
         all_elements.append((anchor, el))
         if not el_in_head:
             style = el.get('style', '')
             if style:
                 style += '; '
             style += '/* deliverance */ border: 2px dotted #f00'
             el.set('style', style)
         else:
             el.set('DELIVERANCE-MATCH', '1')
     def highlight(html_code):
         """Highlights the given code (for use in the template)"""
         if isinstance(html_code, _Element):
             html_code = tostring(html_code)
         return html(pygments_highlight(html_code, HtmlLexer(),
                                        HtmlFormatter(noclasses=True)))
     def format_tag(tag):
         """Highlights the lxml HTML tag"""
         return highlight(tostring(tag).split('>')[0]+'>')
     def wrap_html(html, width=100):
         if isinstance(html, _Element):
             html = tostring(html)
         lines = html.splitlines()
         new_lines = []
         def wrap_html_line(line):
             if len(line) <= width:
                 return [line]
             match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
             if match_trail:
                 result = [match_trail.group(0)]
                 result.extend(wrap_html_line(line[match_trail.end():]))
                 return result
             match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
             match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
             if not match1 or not match2:
                 return [line]
             result = [match1.group(0)]
             result.extend(wrap_html_line(line[match1.end():match2.start()]))
             result.append(match2.group(0))
             return result
         for line in lines:
             new_lines.extend(wrap_html_line(line))
         return '\n'.join(new_lines)
     def mark_deliv_match(highlighted_text):
          # pass re.S as a flag; re.sub's fourth positional argument is the count
          result = re.sub(r'(?:<[^/][^>]*>)*&lt;.*?DELIVERANCE-MATCH=.*?&gt;(?:</[^>]*>)*', lambda match: r'<b style="background-color: #ff8">%s</b>' % match.group(0), unicode(highlighted_text), flags=re.S)
         return html(result)
     text = template.substitute(
         base_url=url,
         els_in_head=els_in_head, doc=doc,
         elements=all_elements, selector=selector, 
         format_tag=format_tag, highlight=highlight, 
         wrap_html=wrap_html, mark_deliv_match=mark_deliv_match)
     message = fromstring(
         self._message_template.substitute(message=text, url=url))
     if doc.body.text:
         message.tail = doc.body.text
         doc.body.text = ''
     doc.body.insert(0, message)
     text = tostring(doc)
     return Response(text)
Example no. 41
    perc = float(output[1].split(':')[1].split('%')[0])
    gcov = output[2].strip().split()[1].strip("'")

    # move generated gcov to coverage folder
    new_dir = os.path.join(target_dir, os.path.dirname(source))
    try:
        os.makedirs(new_dir)
    except OSError:
        pass
    os.rename(os.path.join(obspy_dir, gcov), os.path.join(new_dir, gcov))
    cov.append((filename, os.path.join(new_dir, gcov), perc))


# GENERATE HTML
page = fromstring("<html><table></table></html>")
table = page.xpath('.//table')[0]
for name, gcov, perc in cov:
    td1, td2 = Element('td'), Element('td')
    gcov = gcov.replace(target_dir, './')
    a = Element('a', attrib={'href': gcov})
    a.text = name
    td1.append(a)
    td2.text = "%6.2f%%" % perc
    tr = Element('tr')
    tr.extend([td1, td2])
    table.append(tr)
with open(os.path.join(target_dir, 'index.html'), 'wb') as fp:
    fp.write(tostring(page))

cleanup('*.o')