Python getPureString Examples

Programming Language: Python

Namespace/Package Name: commonutil.lxmlutil

Method/Function: getPureString

Examples at hotexamples.com: 4

Python getPureString - 4 examples found. These are the top-rated real-world Python examples of commonutil.lxmlutil.getPureString, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: pageinfo.py Project: writinglin/commonfunction

def fetch(url):
    """Download a page and pull basic metadata out of its <head>.

    The page body is obtained through ``ContentFetcher(url, tried=2)`` and
    parsed with ``lxml.html``.  Each value that is found and non-empty is
    passed through ``lxmlutil.getPureString`` before being stored.

    Args:
        url: Address handed to ``ContentFetcher``.

    Returns:
        A dict that may contain the keys ``'keywords'``, ``'description'``
        and ``'title'``.  It is empty when no content was fetched or the
        content could not be parsed as HTML.
    """
    result = {}
    content = ContentFetcher(url, tried=2).fetch().get('content')
    if not content:
        return result
    try:
        root = lxml.html.fromstring(content)
    except Exception:
        # Parsing is best-effort: report the failure and return nothing.
        logging.error('Failed to load html from content.')
        return result

    # <meta name=...> entries keep their value in the "content" attribute.
    metaSelectors = (
        ('keywords', 'head meta[name=keywords]'),
        ('description', 'head meta[name=description]'),
    )
    for key, cssSelector in metaSelectors:
        found = pyquery.PyQuery(root)(cssSelector)
        if not found:
            continue
        raw = found[0].get('content')
        if raw:
            result[key] = lxmlutil.getPureString(raw)

    # The title is taken from the element's text rather than an attribute.
    found = pyquery.PyQuery(root)('head title')
    if found:
        text = found[0].text_content()
        if text:
            result['title'] = lxmlutil.getPureString(text)
    return result

Example #2

Show file

def getElementValue(element, selector):
    """Resolve ``selector`` relative to ``element`` and return a value.

    ``selector`` is ``'self'``, ``'parent'`` or a selector handed to
    PyQuery, optionally followed by a trailing ``[attr]`` suffix naming the
    attribute to read.  The reserved suffixes ``[@text]`` and ``[@tail]``
    read the element's ``text`` / ``tail`` instead of an attribute.

    A trailing bracket that contains a quote or an ``=`` sign (for example
    ``meta[name=keywords]`` or ``a[rel="nofollow"]``) is a CSS attribute
    *value* selector, not an attribute to read, and is left in the selector.

    Args:
        element: Element the lookup starts from.
        selector: Selector string as described above.

    Returns:
        ``None`` when no element is matched; the matched element when no
        attribute suffix was given; otherwise the requested value after
        being passed through ``lxmlutil.getPureString``.
    """
    main = selector
    attr = None
    if selector.endswith(']'):
        rindex = selector.rfind('[')
        if rindex >= 0:
            bracket = selector[rindex + 1:-1]
            # An attribute name can contain neither '"' nor '='.  If either
            # is present the bracket is a CSS value selector such as
            # [attr="value"], [attr='value'] or [attr=value] and must stay
            # part of the selector.
            if '"' not in bracket and '=' not in bracket:
                main = selector[:rindex]
                attr = bracket
    reservedAttrs = ['@text', '@tail']
    mainElement = None
    if main == 'self':
        mainElement = element
    elif main == 'parent':
        mainElement = element.getparent()
    else:
        if attr and attr not in reservedAttrs:
            # Only match elements that actually carry the requested attribute.
            main = main + '[' + attr + ']'
        match = pyquery.PyQuery(element)(main)
        if match:
            mainElement = match[0]
    if mainElement is None:
        return None
    if attr:
        if attr == '@text':
            value = mainElement.text
        elif attr == '@tail':
            value = mainElement.tail
        else:
            value = mainElement.get(attr)
        value = lxmlutil.getPureString(value)
        return value
    return mainElement

Example #3

Show file

File: htmlcontentparser.py Project: economylin/newsmonitor

def getElementValue(element, selector):
    """Look up ``selector`` starting at ``element`` and return the result.

    Supported selector forms:

    * ``"self"`` - the element itself.
    * ``"parent"`` - the result of ``element.getparent()``.
    * anything else - passed to PyQuery; the first match is used.

    Any of these may end in ``[attr]`` to read an attribute of the resolved
    element.  ``[@text]`` and ``[@tail]`` are reserved and read the
    element's ``text`` / ``tail``.  A trailing bracket containing ``"`` or
    ``=`` (e.g. ``meta[name=keywords]``, ``a[rel="x"]``) is treated as a CSS
    attribute value selector and stays part of the selector.

    Args:
        element: Element the lookup starts from.
        selector: Selector string as described above.

    Returns:
        ``None`` if nothing was resolved, the resolved element if no
        attribute suffix was given, or the requested value after
        ``lxmlutil.getPureString`` has been applied to it.
    """
    main = selector
    attr = None
    if selector.endswith("]"):
        rindex = selector.rfind("[")
        if rindex >= 0:
            candidate = selector[rindex + 1 : -1]
            # '"' or '=' inside the bracket means a CSS value selector
            # ([attr="value"], [attr='value'], [attr=value]); an attribute
            # name to read can contain neither character.
            isValueSelector = '"' in candidate or "=" in candidate
            if not isValueSelector:
                main = selector[:rindex]
                attr = candidate
    reservedAttrs = ["@text", "@tail"]
    mainElement = None
    if main == "self":
        mainElement = element
    elif main == "parent":
        mainElement = element.getparent()
    else:
        if attr and attr not in reservedAttrs:
            # Restrict the match to elements that have the attribute.
            main = main + "[" + attr + "]"
        match = pyquery.PyQuery(element)(main)
        if match:
            mainElement = match[0]
    if mainElement is None:
        return None
    if attr:
        if attr == "@text":
            value = mainElement.text
        elif attr == "@tail":
            value = mainElement.tail
        else:
            value = mainElement.get(attr)
        value = lxmlutil.getPureString(value)
        return value
    return mainElement

Example #4

Show file

File: alexainfo.py Project: writinglin/commonfunction

def getDmozInfo(tree):
    """Collect DMOZ details from the first ``/ALEXA/DMOZ/SITE`` node.

    Args:
        tree: Object exposing ``xpath()``; queried for ``/ALEXA/DMOZ/SITE``.

    Returns:
        A dict with an optional ``'desc'`` entry (the site's ``DESC``
        attribute, when non-empty) and an optional ``'categories'`` entry
        (the ``ID`` attribute of every ``CATS/CAT`` child, each passed
        through ``lxmlutil.getPureString``).  Empty when no site node exists.
    """
    info = {}
    siteNodes = tree.xpath('/ALEXA/DMOZ/SITE')
    if not siteNodes:
        return info

    firstSite = siteNodes[0]
    description = firstSite.get('DESC')
    if description:
        info['desc'] = description

    catNodes = firstSite.xpath('CATS/CAT')
    # Querying 'CATS/CAT/@ID' directly would yield
    # lxml.etree._ElementUnicodeResult objects, whereas
    # HtmlElement.get(attr) returns a normal string.
    # TODO: testcase to validate it
    if catNodes:
        ids = []
        for node in catNodes:
            ids.append(lxmlutil.getPureString(node.get('ID')))
        info['categories'] = ids
    return info