def _getTitleElements(docelement, mainTitle):
    """Collect elements under the document's <body> that mention mainTitle.

    The first 'body' match found by PyQuery in ``docelement`` is searched
    with ``lxmlutil.findAllVisibleMatched``, using the same predicate for
    both the text and the tail callbacks.

    Returns the list filled in by ``findAllVisibleMatched``, or an empty
    list when the document has no 'body' match.
    """
    def containsTitle(text):
        # Falsy text (None / '') is passed back unchanged; otherwise the
        # result is the boolean substring test.
        return text and mainTitle in text

    bodies = pyquery.PyQuery(docelement)('body')
    if not bodies:
        return []

    matched = []
    # NOTE(review): presumably appends visible elements whose text/tail
    # satisfies the predicate -- confirm against lxmlutil.
    lxmlutil.findAllVisibleMatched(
        matched, bodies[0], containsTitle, containsTitle)
    return matched
def _getPublishedInside(publishedFormat, element):
    """Find the first match for a published value in or below ``element``.

    ``_getPublished(publishedFormat, text)`` is used as both the text and
    the tail predicate for ``lxmlutil.findAllVisibleMatched``, which is
    called with ``includeSelf=True``.

    Returns a 3-tuple ``(matchedElement, first, second)`` where ``first``
    and ``second`` are items 0 and 1 of the first entry collected in the
    predicate-result list, or ``None`` when nothing matched.
    """
    def parsePublished(text):
        return _getPublished(publishedFormat, text)

    matched = []
    parsed = []
    # NOTE(review): ``parsed`` is presumably filled with the predicate's
    # return value for each matched element -- confirm against lxmlutil.
    lxmlutil.findAllVisibleMatched(
        matched, element, parsePublished, parsePublished, parsed,
        includeSelf=True)

    if not matched:
        return None

    firstParsed = parsed[0]
    return matched[0], firstParsed[0], firstParsed[1]
def _getMainElement(contentElement, titleElement):
    """Pick the most likely main-content element inside ``contentElement``.

    Every element collected by ``lxmlutil.findAllVisibleMatched`` is
    weighted with ``_getChildTextLength``. Candidates whose weight is at
    least 100 are re-scored by child count and source-line distance to
    ``titleElement``, and the best-scored one is returned. When no candidate
    reaches that threshold, the element with the largest raw weight is
    returned instead.

    Raises ValueError (from ``max``) when no elements were collected.
    """
    minMainLength = 100

    candidates = []
    lxmlutil.findAllVisibleMatched(candidates, contentElement)

    # Phase 1: raw text-length weight for every candidate.
    weighted = [(_getChildTextLength(node), node) for node in candidates]

    # Phase 2: re-score only the candidates that are long enough.
    scored = []
    for length, node in weighted:
        if length >= minMainLength:
            # An element with more children is preferred.
            # An element closer to the title element is preferred.
            # 11 is added to avoid dividing by zero and to balance the
            # ratio, e.g. (m/2, n/22) vs (m/12, n/32): the latter n has
            # more chance.
            childCount = len(node.getchildren())
            lineDistance = abs(node.sourceline - titleElement.sourceline)
            score = length * childCount * 1.0 / (lineDistance + 11)
            scored.append((score, node))

    # Fall back to the raw weights when nothing passed the threshold.
    pool = scored if scored else weighted
    return max(pool, key=lambda pair: pair[0])[1]