def _gnewsItem2Page(item):
    """Convert one news result entry into a page dict.

    Args:
        item: Mapping for a single result. The keys read are ``title``,
            ``unescapedUrl``, ``content``, ``publisher``, ``publishedDate``
            and, optionally, ``image`` (itself a mapping with ``url``,
            ``tbWidth`` and ``tbHeight``).

    Returns:
        A dict with ``title``, ``url``, ``content``, ``publisher`` and
        ``published`` keys. ``title`` and ``content`` are passed through
        ``htmlutil.getTextContent``; ``published`` is the result of
        ``dateutil.jsDate2utc14`` on the raw ``publishedDate`` value. An
        ``img`` dict (``url``, ``width``, ``height``) is added only when
        the entry carries a truthy ``image`` value.
    """
    page = {
        "title": htmlutil.getTextContent(item.get("title")),
        "url": item.get("unescapedUrl"),
        "content": htmlutil.getTextContent(item.get("content")),
        "publisher": item.get("publisher"),
        "published": dateutil.jsDate2utc14(item.get("publishedDate")),
    }

    image = item.get("image")
    if image:
        # The thumbnail dimensions are exposed under the shorter
        # "width"/"height" names in the resulting page dict.
        page["img"] = {
            "url": image.get("url"),
            "width": image.get("tbWidth"),
            "height": image.get("tbHeight"),
        }
    return page
def getItemFromScript(element):
    """Build an item dict from an anchor element.

    The URL comes from the element's ``href`` attribute. The title is
    whatever ``lxmlutil.getScriptConstantString`` extracts from the
    element, reduced to plain text by ``htmlutil.getTextContent``.

    Args:
        element: An element-like object exposing ``tag`` and ``get``
            (presumably an lxml element; confirm against callers).

    Returns:
        ``None`` when the element is not an ``a`` tag. Otherwise a dict
        that contains ``url`` and/or ``title`` only for the values that
        are truthy; it may therefore be empty.
    """
    # Only anchor elements are handled; anything else is rejected up front.
    if element.tag != "a":
        return None

    item = {}

    url = element.get("href")
    if url:
        item["url"] = url

    title = lxmlutil.getScriptConstantString(element)
    if title:
        # Text extraction may reduce the string to nothing, so the title is
        # re-checked below before being stored.
        title = htmlutil.getTextContent(title)
    if title:
        item["title"] = title

    return item
def _googleItem2page(item):
    """Convert one search result entry into a page dict.

    Args:
        item: Mapping for a single result. The keys read are ``title``,
            ``unescapedUrl`` and ``content``.

    Returns:
        A dict with ``title``, ``url`` and ``content`` keys. ``title`` and
        ``content`` are passed through ``htmlutil.getTextContent``; ``url``
        is the raw ``unescapedUrl`` value (``None`` when absent).
    """
    return {
        'title': htmlutil.getTextContent(item.get('title')),
        'url': item.get('unescapedUrl'),
        'content': htmlutil.getTextContent(item.get('content')),
    }