def fetch(url):
    """Fetch *url* and pull basic metadata out of the HTML ``<head>``.

    The page is retrieved through ``ContentFetcher`` and parsed with
    ``lxml.html``.

    Returns:
        A dict that may hold ``'keywords'``, ``'description'`` and ``'title'``.
        A key is present only when the matching element exists and yields
        a non-empty value. The dict is empty when the fetch result has no
        content or the content cannot be parsed.
    """
    page_info = {}

    response = ContentFetcher(url, tried=2).fetch()
    body = response.get('content')
    if not body:
        return page_info

    try:
        root = lxml.html.fromstring(body)
    except Exception:
        logging.error('Failed to load html from content.')
        return page_info

    # (result key, CSS selector, reader for the raw value of the first match)
    lookups = (
        ('keywords', 'head meta[name=keywords]',
         lambda node: node.get('content')),
        ('description', 'head meta[name=description]',
         lambda node: node.get('content')),
        ('title', 'head title',
         lambda node: node.text_content()),
    )
    for key, css, read_raw in lookups:
        found = pyquery.PyQuery(root)(css)
        if not found:
            continue
        raw = read_raw(found[0])
        if raw:
            page_info[key] = lxmlutil.getPureString(raw)

    return page_info
def getElementValue(element, selector):
    """Resolve *selector* relative to *element*.

    *selector* is a CSS selector, or the keyword ``self`` / ``parent``,
    optionally followed by ``[name]``. The suffix asks for a value rather
    than an element: ``[@text]`` reads ``.text``, ``[@tail]`` reads ``.tail``,
    and any other name reads that attribute. A trailing bracket containing a
    double quote is treated as an ordinary ``attr="value"`` filter instead.

    Returns:
        ``None`` when nothing matches; the matched element when no value
        suffix was given; otherwise the value passed through
        ``lxmlutil.getPureString``.

    NOTE(review): this module defines ``getElementValue`` a second time
    further down; Python binds the name to the later definition, so this
    copy appears to be shadowed -- confirm and remove one of them.
    """
    target = selector
    wanted = None
    if selector.endswith(']'):
        bracket_at = selector.rfind('[')
        # A double quote inside the bracket means attr="value": leave it
        # in the selector rather than treating it as a value request.
        if bracket_at >= 0 and '"' not in selector[bracket_at:]:
            target = selector[:bracket_at]
            wanted = selector[bracket_at + 1:-1]

    if target == 'self':
        node = element
    elif target == 'parent':
        node = element.getparent()
    else:
        if wanted and wanted not in ('@text', '@tail'):
            # Only match elements that actually carry the attribute.
            target = target + '[' + wanted + ']'
        hits = pyquery.PyQuery(element)(target)
        node = hits[0] if hits else None

    if node is None:
        return None
    if not wanted:
        return node

    if wanted == '@text':
        raw = node.text
    elif wanted == '@tail':
        raw = node.tail
    else:
        raw = node.get(wanted)
    return lxmlutil.getPureString(raw)
def getElementValue(element, selector):
    """Resolve *selector* relative to *element* and return an element or value.

    *selector* is a CSS selector, or the keyword ``self`` / ``parent``,
    optionally followed by a value suffix in square brackets:

    * ``[@text]`` -- the ``.text`` of the matched element
    * ``[@tail]`` -- the ``.tail`` of the matched element
    * ``[name]``  -- the attribute ``name``; for CSS selectors only elements
      that carry the attribute are matched

    A trailing bracket that contains ``=`` or a quote character is an ordinary
    CSS attribute filter (``a[rel="x"]``, ``a[rel='x']``, ``a[rel=x]``) and is
    left in the selector, so the matched element itself is returned.

    Args:
        element: node the lookup starts from.
        selector: selector string as described above.

    Returns:
        ``None`` when nothing matches (including ``parent`` on a node whose
        ``getparent()`` returns ``None``); the matched element when no value
        suffix was given; otherwise the value passed through
        ``lxmlutil.getPureString``.
    """
    main = selector
    attr = None
    if selector.endswith("]"):
        bracket_at = selector.rfind("[")
        if bracket_at >= 0:
            inner = selector[bracket_at + 1 : -1]
            # An attribute *name* cannot contain '=' or a quote, so their
            # presence means this is an attribute-value filter, whichever
            # quoting style (double, single or none) the caller used.
            if not any(marker in inner for marker in ("=", '"', "'")):
                main = selector[:bracket_at]
                attr = inner

    reservedAttrs = ("@text", "@tail")
    mainElement = None
    if main == "self":
        mainElement = element
    elif main == "parent":
        mainElement = element.getparent()
    else:
        if attr and attr not in reservedAttrs:
            # Restrict the match to elements that have the attribute.
            main = main + "[" + attr + "]"
        match = pyquery.PyQuery(element)(main)
        if match:
            mainElement = match[0]

    if mainElement is None:
        return None
    if not attr:
        return mainElement

    if attr == "@text":
        value = mainElement.text
    elif attr == "@tail":
        value = mainElement.tail
    else:
        value = mainElement.get(attr)
    return lxmlutil.getPureString(value)
def getDmozInfo(tree):
    """Extract DMOZ details from the first ``/ALEXA/DMOZ/SITE`` node of *tree*.

    Returns:
        A dict with ``'desc'`` (the node's ``DESC`` attribute, when non-empty)
        and ``'categories'`` (the ``ID`` of every ``CATS/CAT`` child, each
        passed through ``lxmlutil.getPureString``, when any exist). The dict
        is empty when no SITE node is found.
    """
    info = {}

    site_nodes = tree.xpath('/ALEXA/DMOZ/SITE')
    if not site_nodes:
        return info
    first_site = site_nodes[0]

    description = first_site.get('DESC')
    if description:
        info['desc'] = description

    # Query the CAT elements and read ID via .get() rather than using the
    # 'CATS/CAT/@ID' xpath: the xpath form returns
    # lxml.etree._ElementUnicodeResult, while .get(attr) returns a normal
    # string.
    # TODO: testcase to validate it
    cat_nodes = first_site.xpath('CATS/CAT')
    if cat_nodes:
        info['categories'] = [
            lxmlutil.getPureString(node.get('ID')) for node in cat_nodes
        ]

    return info