Example #1
0
def fetch(url):
    """Fetch ``url`` and extract basic metadata from the page's HTML head.

    Returns a dict that may contain:

    * 'keywords'    -- ``content`` attribute of ``head meta[name=keywords]``
    * 'description' -- ``content`` attribute of ``head meta[name=description]``
    * 'title'       -- text content of ``head title``

    A key is present only when the element is found and its value is
    non-empty; every stored value is passed through
    ``lxmlutil.getPureString``. An empty dict is returned when the fetch
    result has no content or the content cannot be parsed as HTML.
    """
    result = {}
    # NOTE(review): `tried` is presumably an attempt counter -- confirm
    # against ContentFetcher.
    content = ContentFetcher(url, tried=2).fetch().get('content')
    if not content:
        return result
    try:
        htmlelement = lxml.html.fromstring(content)
    except Exception:
        # Best effort: a page that cannot be parsed yields no metadata.
        # logging.exception keeps the original message and ERROR level but
        # also records the traceback so the failure can be diagnosed.
        logging.exception('Failed to load html from content.')
        return result

    # Build the query wrapper once and reuse it for every selector.
    query = pyquery.PyQuery(htmlelement)
    # (result key, CSS selector, how to read the value from the first match)
    fields = (
        ('keywords', 'head meta[name=keywords]',
         lambda node: node.get('content')),
        ('description', 'head meta[name=description]',
         lambda node: node.get('content')),
        ('title', 'head title',
         lambda node: node.text_content()),
    )
    for key, selector, readValue in fields:
        match = query(selector)
        if not match:
            continue
        value = readValue(match[0])
        if value:
            result[key] = lxmlutil.getPureString(value)
    return result
Example #2
0
def getElementValue(element, selector):
    """Resolve ``selector`` against ``element`` and return an element or value.

    ``selector`` is 'self', 'parent', or a CSS selector evaluated with
    pyquery in the context of ``element``. It may end with an ``[attr]``
    suffix naming what to read from the first matching element:

    * ``[@text]`` / ``[@tail]`` -- the element's ``text`` / ``tail``
    * ``[name]`` -- the attribute ``name``; for CSS selectors the suffix is
      also kept in the query, so only elements carrying that attribute match

    A trailing bracket containing ``=`` or ``"`` (``[name=keywords]``,
    ``[name='x']``, ``[name="x"]``) is an attribute-value filter, not a
    value to read, and stays part of the CSS selector.

    Returns None when no element is found, the element itself when no
    attribute suffix is given, and otherwise the value after
    ``lxmlutil.getPureString``.
    """
    main = selector
    attr = None
    if selector.endswith(']'):
        rindex = selector.rfind('[')
        if rindex >= 0:
            bracket = selector[rindex:]
            # '=' appears in every attribute-value operator (=, ~=, |=, ^=,
            # $=, *=) whether or not the value is quoted; the '"' check is
            # kept so previously recognised filters are still recognised.
            if '=' not in bracket and '"' not in bracket:
                main = selector[:rindex]
                attr = selector[rindex + 1:-1]
    reservedAttrs = ['@text', '@tail']
    mainElement = None
    if main == 'self':
        mainElement = element
    elif main == 'parent':
        mainElement = element.getparent()
    else:
        if attr and attr not in reservedAttrs:
            # Only match elements that actually have the requested attribute.
            main = main + '[' + attr + ']'
        match = pyquery.PyQuery(element)(main)
        if match:
            mainElement = match[0]
    if mainElement is None:
        return None
    if not attr:
        return mainElement
    if attr == '@text':
        value = mainElement.text
    elif attr == '@tail':
        value = mainElement.tail
    else:
        value = mainElement.get(attr)
    return lxmlutil.getPureString(value)
def getElementValue(element, selector):
    """Return the element, or a value read from it, that ``selector`` names.

    ``selector`` is "self", "parent", or a CSS selector run through pyquery
    with ``element`` as context. An optional trailing ``[attr]`` suffix
    chooses what to read from the first match:

    * ``[@text]`` / ``[@tail]`` -- the element's ``text`` / ``tail``
    * ``[name]`` -- the attribute ``name``; for CSS selectors the suffix
      stays in the query so only elements with that attribute are matched

    A trailing bracket that contains ``=`` or ``"`` (for example
    ``[name=keywords]``, ``[name='x']`` or ``[name="x"]``) is an
    attribute-value filter and is left inside the CSS selector.

    Returns None if no element is found, the element when there is no
    attribute suffix, otherwise the value passed through
    ``lxmlutil.getPureString``.
    """
    main = selector
    attr = None
    if selector.endswith("]"):
        rindex = selector.rfind("[")
        if rindex >= 0:
            bracket = selector[rindex:]
            # Every attribute-value operator (=, ~=, |=, ^=, $=, *=) contains
            # "=", quoted value or not; the '"' test is retained so filters
            # that were recognised before are still recognised.
            if "=" not in bracket and '"' not in bracket:
                main = selector[:rindex]
                attr = selector[rindex + 1 : -1]
    reservedAttrs = ["@text", "@tail"]
    mainElement = None
    if main == "self":
        mainElement = element
    elif main == "parent":
        mainElement = element.getparent()
    else:
        if attr and attr not in reservedAttrs:
            # Restrict the match to elements that have the attribute.
            main = main + "[" + attr + "]"
        match = pyquery.PyQuery(element)(main)
        if match:
            mainElement = match[0]
    if mainElement is None:
        return None
    if not attr:
        return mainElement
    if attr == "@text":
        value = mainElement.text
    elif attr == "@tail":
        value = mainElement.tail
    else:
        value = mainElement.get(attr)
    return lxmlutil.getPureString(value)
Example #4
0
def getDmozInfo(tree):
    """Collect DMOZ details for the first ``/ALEXA/DMOZ/SITE`` node of ``tree``.

    Returns a dict with 'desc' (the node's ``DESC`` attribute, when
    non-empty) and 'categories' (the ``ID`` attribute of each ``CATS/CAT``
    child, passed through ``lxmlutil.getPureString``, when any exist). The
    dict is empty when ``tree`` has no such SITE node.
    """
    info = {}
    siteNodes = tree.xpath('/ALEXA/DMOZ/SITE')
    if not siteNodes:
        return info

    firstSite = siteNodes[0]
    description = firstSite.get('DESC')
    if description:
        info['desc'] = description

    # Selecting 'CATS/CAT/@ID' directly would return
    # lxml.etree._ElementUnicodeResult, whereas reading the attribute with
    # .get() returns a normal string.
    # TODO: testcase to validate it
    catNodes = firstSite.xpath('CATS/CAT')
    if catNodes:
        categoryIds = []
        for node in catNodes:
            categoryIds.append(lxmlutil.getPureString(node.get('ID')))
        info['categories'] = categoryIds
    return info