def parse(publishedFormat, titleElement, mainElement):
    """Look for a published value in the elements between title and main.

    Starting from the element that follows ``titleElement`` (as returned by
    ``lxmlutil.getFullNext``), every element whose ``sourceline`` is smaller
    than ``mainElement.sourceline`` is passed to ``_getPublishedInside``
    together with ``publishedFormat``. The walk stops at the first truthy
    result.

    Args:
        publishedFormat: Passed through unchanged to ``_getPublishedInside``.
        titleElement: Element providing the start of the search range; must
            expose ``sourceline``.
        mainElement: Element providing the (exclusive) end of the search
            range; must expose ``sourceline``.

    Returns:
        The first truthy value returned by ``_getPublishedInside``. ``None``
        when the title is not located on an earlier source line than the main
        element or when no element was inspected; otherwise the last (falsy)
        result of ``_getPublishedInside``.
    """
    if titleElement.sourceline >= mainElement.sourceline:
        return None

    publishedResult = None
    element = lxmlutil.getFullNext(titleElement)
    # getFullNext may return None when there is no further element; stop the
    # walk in that case instead of failing on ``None.sourceline``.
    while element is not None and element.sourceline < mainElement.sourceline:
        publishedResult = _getPublishedInside(publishedFormat, element)
        if publishedResult:
            break
        element = lxmlutil.getFullNext(element)
    return publishedResult
def _getNextText(element):
    """Return the stripped text of the first element carrying text or tail.

    The search starts at ``element`` itself and advances with
    ``lxmlutil.getFullNext``. For each element, its ``text`` is used when
    ``lxmlutil.isVisibleElement`` accepts the element and the text is
    non-empty; otherwise its ``tail`` is used when non-empty.

    Note that the value is stripped only after it was selected, so a
    whitespace-only text or tail yields an empty string.

    Returns:
        The stripped string, or ``None`` when the walk runs out of elements
        (including when ``element`` is ``None``).
    """
    node = element
    while node is not None:
        if lxmlutil.isVisibleElement(node):
            ownText = node.text
            if ownText:
                return ownText.strip()
        # Reached for invisible elements and for visible ones without text.
        trailing = node.tail
        if trailing:
            return trailing.strip()
        node = lxmlutil.getFullNext(node)
    return None
def parse(url, contentElement, titleElement, mainElement):
    """Collect parsed images found between the title and the end of main.

    All ``img`` elements selected from ``contentElement`` via PyQuery are
    visited in the order PyQuery yields them. Images on a source line before
    ``titleElement.sourceline`` are skipped. The scan stops at the first
    image on a source line after that of the element following
    ``mainElement`` (per ``lxmlutil.getFullNext``). When no such following
    element exists there is no upper limit.

    Args:
        url: Passed through unchanged to ``_parseImg``.
        contentElement: Root handed to ``pyquery.PyQuery`` for the ``img``
            query.
        titleElement: Element whose ``sourceline`` is the lower bound.
        mainElement: Element whose successor supplies the upper bound.

    Returns:
        A list with every truthy result of ``_parseImg`` for the images in
        range; empty when nothing qualified.
    """
    firstLine = titleElement.sourceline
    follower = lxmlutil.getFullNext(mainElement)
    # -1 is the "no upper bound" marker; only a positive line number limits
    # the scan below.
    lastLine = -1 if follower is None else follower.sourceline

    images = []
    for node in pyquery.PyQuery(contentElement)('img'):
        if node.sourceline >= firstLine:
            if lastLine > 0 and node.sourceline > lastLine:
                break
            parsed = _parseImg(url, node)
            if parsed:
                images.append(parsed)
    return images