def guessParagraphs(s, dont_wrap=["h1", "h2", "h3", "pre", "center", "table"]):
    """
    Split the HTML in `s` into paragraph candidates and return it as a
    "beautified" string.

    The string is parsed into a DOM tree, the children of the first top-level
    element are grouped into buffers (a new buffer is started at every opening
    tag listed in `dont_wrap`), each buffer is handed to `__processBuffer`,
    blank <p> elements are dropped and finally a set of textual replacements
    is applied to the serialized tree.

    Args:
        s: HTML source to process.
        dont_wrap: Lowercase tag names which act as buffer separators. The
            default list is only read, never modified.

    Returns:
        The processed document serialized with `str()` and post-processed by
        the plain and regular-expression replacements below.
    """
    # parse string and make it double-linked tree
    node = d.parseString(s)
    d.makeDoubleLinked(node)

    # Collect runs of elements found between the `dont_wrap` opening tags.
    # The separating opening tag itself is not stored in any buffer.
    tmp = []
    buffs = []
    for el in node.childs[0].childs:
        if el.getTagName().lower() in dont_wrap and not el.isEndTag():
            buffs.append(tmp)
            tmp = []
        else:
            tmp.append(el)
    buffs.append(tmp)

    # process paragraphs (presumably wraps each buffer into <p> - see
    # __processBuffer for details)
    for buff in buffs:
        __processBuffer(buff)

    # Remove blank <p>aragraphs.  This used to be a `map()` call executed only
    # for its side effects; `map()` is lazy on Python 3, so nothing was ever
    # replaced there.  The blank elements are collected first and replaced
    # afterwards, which is the same order of operations as the eager Python 2
    # `filter()` + `map()` pair.
    blank_paragraphs = [
        par for par in node.find("p")
        if par.getContent().strip() == ""
    ]
    for par in blank_paragraphs:
        par.replaceWith(d.HTMLElement(""))

    # plain string replacements, applied in this exact order
    replacements = [
        ("<p>",         "\n<p>"),
        ("</p>",        "</p>\n\n"),
        ("<p>\n",       "<p>"),
        ("<h",          "\n<h"),
        ("\t",          ""),
        ("<p><br />\n", "<p>"),
        ("<p></p>\n",   ""),
    ]

    # lines starting with a bullet character are turned into <li> items
    regular_replacements = [
        (r"• (.*)</p>\n", r"<li>\1</li>\n</p>\n"),
        (r"• (.*)\n", r"<li>\1</li>\n"),
    ]

    str_node = str(node)

    for old, new in replacements:
        str_node = str_node.replace(old, new)

    for pattern, substitute in regular_replacements:
        str_node = re.sub(pattern, substitute, str_node)

    return str_node
def getNodePath(dom, nodeid):
    """
    Return the file path of the node with the given |nodeid|.

    The path is composed from the `name` parameters of the matching <node>
    element and of its enclosing <node> elements.  A node without nested
    <node> elements maps to `<path>.html`, any other node maps to
    `<path>/index.html`.  A `filename:<name>` entry in the node's `tags`
    parameter replaces the last path component.

    The result is passed through `utfToFilename()` before it is returned.
    Looking up the node indexes the first search result, so an unknown
    |nodeid| presumably ends with an IndexError.
    """
    # make sure that the elements know their parents
    first_child = dom.childs[0]
    if not hasattr(first_child, 'parent') or first_child.parent != dom:
        d.makeDoubleLinked(dom)

    target = dom.find("node", {"unique_id": str(nodeid)})[0]

    # look for the first `filename:` tag in the whitespace separated tag list
    tag_string = ""
    if "tags" in target.params and target.params["tags"].strip() != "":
        tag_string = target.params["tags"]
    new_filename = next(
        (
            tag.split(":")[1]
            for tag in tag_string.split()
            if tag.startswith("filename:")
        ),
        None
    )

    # The search apparently counts the node itself, so at most one hit is
    # treated as "no nested nodes".
    is_endpoint = len(target.find("node")) <= 1

    # walk up thru the enclosing <node> elements and collect their names
    names = []
    cursor = target
    while cursor.parent is not None and cursor.getTagName().lower() == "node":
        names.append(cursor.params["name"])
        cursor = cursor.parent

    # names were collected from the innermost node, path starts at outermost
    path = "".join(name + "/" for name in reversed(names))

    # endpoints lose the trailing '/', directories get an index file
    path = (path[:-1] if is_endpoint else path + "index") + ".html"

    # filename from the tags replaces the last component of the path
    if new_filename is not None:
        directory = os.path.dirname(path)
        if directory.strip() != "":
            directory += "/"
        path = directory + new_filename

    return utfToFilename(path)
Example #3
0
def guessParagraphs(s, dont_wrap=["h1", "h2", "h3", "pre", "center", "table"]):
    """
    Split the HTML in `s` into paragraph candidates and return it as a
    "beautified" string.

    The string is parsed into a DOM tree, the children of the first top-level
    element are grouped into buffers (a new buffer is started at every opening
    tag listed in `dont_wrap`), each buffer is handed to `__processBuffer`,
    blank <p> elements are dropped and the serialized tree is cleaned up by a
    fixed chain of string replacements.

    Args:
        s: HTML source to process.
        dont_wrap: Lowercase tag names which act as buffer separators. The
            default list is only read, never modified.

    Returns:
        The processed document serialized with `str()` with newlines
        normalized around <p> and <h*> tags.
    """
    # parse string and make it double-linked tree
    node = d.parseString(s)
    d.makeDoubleLinked(node)

    # Collect runs of elements found between the `dont_wrap` opening tags.
    # The separating opening tag itself is not stored in any buffer.
    tmp = []
    buffs = []
    for el in node.childs[0].childs:
        if el.getTagName().lower() in dont_wrap and not el.isEndTag():
            buffs.append(tmp)
            tmp = []
        else:
            tmp.append(el)
    buffs.append(tmp)

    # process paragraphs (presumably wraps each buffer into <p> - see
    # __processBuffer for details)
    for buff in buffs:
        __processBuffer(buff)

    # Remove blank <p>aragraphs.  This used to be a `map()` call executed only
    # for its side effects; `map()` is lazy on Python 3, so nothing was ever
    # replaced there.  The blank elements are collected first and replaced
    # afterwards, which is the same order of operations as the eager Python 2
    # `filter()` + `map()` pair.
    blank_paragraphs = [
        par for par in node.find("p")
        if par.getContent().strip() == ""
    ]
    for par in blank_paragraphs:
        par.replaceWith(d.HTMLElement(""))

    # return "beautified" string - the order of the replacements matters,
    # later ones clean up after the earlier ones
    return str(node)                               \
                    .replace("<p>", "\n<p>")       \
                    .replace("</p>", "</p>\n\n")   \
                    .replace("<p>\n", "<p>")       \
                    .replace("<h", "\n<h")         \
                    .replace("<p><br />\n", "<p>")  # don't ask..