def innerhtml(el: Element, encoding: str = "utf-8") -> str:
    """
    Returns the HTML of an element as a ``str``, with the opening and closing
    tags removed.

    An element without children is returned as its ``text_content()``.
    Otherwise the result is the element's leading text (if any) followed by
    each child serialized with ``tostring()`` and decoded with ``encoding``.

    :arg Element el: ``lxml.html.Element`` object.
    :arg str encoding: The character encoding used to decode the serialized
        children.

    :rtype: str
    :returns: A string of HTML without the opening and closing tags.
    """
    # Materialize the children once; the same list serves both the emptiness
    # check and the serialization below.
    children = list(el.iterchildren())
    if not children:
        return el.text_content()
    # ``el.text`` is ``None`` when there is no text before the first child.
    leading_text = el.text or ""
    return leading_text + "".join(
        tostring(child).decode(encoding) for child in children)
def parse_tip(url: str, datestamp: str, cell: Element, encoding: str) -> Tip:
    """
    Create a ``Tip`` object out of the information we have and the HTML element
    for the table cell.

    The majority of the cells open with a ``strong`` element whose content is
    the title, but the HTML isn't completely consistent.

    If that element is there, we want its contents (as HTML) and then the
    remaining contents of the cell (also as HTML), but we want the contents as
    a string, stripped of the opening and closing tags.

    Layouts handled, in order:

    1. The first child of the cell is a ``strong``: it is the title, and the
       body is its trailing text plus the remaining children.
    2. The first child is a ``p`` that has a ``strong`` child: that ``strong``
       is the title, and the body is its trailing text plus the children of
       the ``p`` after the first one.
    3. No direct child of the cell is a ``strong`` (including a cell with no
       child elements at all): the title is generated from ``datestamp`` as
       ``"Timely Tip for <Month DD, YYYY>"`` and the body is every child of
       the cell (or the cell's own text when it has no children).

    :arg str url: The URL for the page.
    :arg str datestamp: ISO 8601 date for the tip (parsed as ``%Y-%m-%d``
        when a title has to be generated).
    :arg Element cell: ``lxml.html.Element`` object, HTML for the table cell.
    :arg str encoding: The encoding for the page.

    :rtype: Tip
    :returns: The parsed tip as a ``Tip`` object.
    :raises RuntimeError: If the cell has a ``strong`` child but matches none
        of the layouts above.
    """
    children = list(cell.iterchildren())
    # A cell may hold only text; guard so we don't index into an empty list.
    first = children[0] if children else None

    # Only look for a wrapped title when the cell really opens with a ``p``.
    # An empty result lets a title-less ``p`` fall through to the "no title"
    # branch instead of failing on ``[0]``.
    wrapped_titles = []
    if first is not None and first.tag == "p":
        wrapped_titles = first.xpath("./strong")

    # The ``strong`` element is often the first child:
    if first is not None and first.tag == "strong":
        title = innerhtml(first, encoding)
        tail = "%s " % first.tail.strip() if first.tail else ""
        new_children = children[1:]
    # But sometimes there's an enclosing ``p`` element:
    elif wrapped_titles:
        strong = wrapped_titles[0]
        title = innerhtml(strong, encoding)
        tail = "%s " % strong.tail.strip() if strong.tail else ""
        # Note: none of the ``p`` elements have trailing content, so we don't
        # have to worry about first.tail.
        # NOTE(review): this skips the first child of the ``p``, which assumes
        # the ``strong`` is that first child -- confirm against the pages.
        new_children = list(first.iterchildren())[1:]
    # And sometimes there's no ``strong`` element at all:
    elif all(child.tag != "strong" for child in children):
        # Some of the early tips don't have titles.
        title_date = datetime.strptime(datestamp,
                                       "%Y-%m-%d").strftime("%B %d, %Y")
        title = "Timely Tip for %s" % title_date
        # With no child elements the cell's own text is all there is, so it
        # becomes the body; otherwise the body is just the children.
        tail = cell.text if (not children and cell.text) else ""
        new_children = children
    # We don't know what to do if none of those conditions are met.
    else:
        # An explicit exception with context; RuntimeError is what a bare
        # ``raise`` with no active exception produces, so the type callers
        # see is the same.
        raise RuntimeError(
            "Unrecognized tip layout for %s (%s): a 'strong' element is "
            "present but not in a recognized position" % (url, datestamp))

    # Now we want the rest of the HTML (not quite the same as ``innerhtml()``)
    body = "%s%s" % (tail, "".join(
        tostring(child).decode(encoding) for child in new_children))

    return Tip(body=body.strip(),
               posted_date=datestamp,
               previous_url=url,
               title_text=title.strip().rstrip('.'))
Esempio n. 3
0
def get_html_strings(element: Element) -> Iterator[str]:
    """Yields HTML text from an element.

    Three shapes are handled:

    * ``element`` has no children: the element itself is serialized with
      ``tostring()`` and yielded as a single string.
    * ``element`` has exactly one child and it is a ``p``: the wrapper is
      dropped; its leading text (when non-empty) is yielded, followed by each
      of its children serialized with ``tostring()``.
    * anything else: each child of ``element`` is serialized and yielded in
      order. Text placed directly in ``element`` is not yielded separately
      in this case.

    :arg Element element: The element whose content should be emitted.

    :rtype: Iterator[str]
    :returns: An iterator over chunks of HTML text.
    """
    # ``list(element)`` is the supported replacement for the deprecated
    # ``Element.getchildren()``.
    children = list(element)
    if not children:
        yield tostring(element).decode()
        return

    # Remove <p>…</p> wrapper created by Cleaner.clean_html().
    if len(children) == 1 and children[0].tag == 'p':
        wrapper = children[0]
        if wrapper.text:
            yield wrapper.text
        # From here on, emit the wrapper's children instead of the wrapper.
        children = wrapper.iterchildren()

    for child in children:
        yield tostring(child).decode()