Example #1
0
def _getTitleElements(docelement, mainTitle):
    """Find the elements inside <body> whose text or tail contains mainTitle.

    The document is queried for 'body' via pyquery; when nothing is selected
    an empty list is returned. Otherwise the first selected element is handed
    to lxmlutil.findAllVisibleMatched together with a predicate used for both
    the text and the tail check, and the list that helper fills is returned.
    """
    bodies = pyquery.PyQuery(docelement)('body')
    if not bodies:
        return []

    def containsTitle(text):
        # Falsy text (None or '') is returned as-is, so it counts as no match.
        return text and mainTitle in text

    matched = []
    lxmlutil.findAllVisibleMatched(matched, bodies[0],
                                   containsTitle, containsTitle)
    return matched
def _getPublishedInside(publishedFormat, element):
    """Look for a published value in element (itself included) or below it.

    Every candidate text/tail is passed to _getPublished(publishedFormat, text)
    through lxmlutil.findAllVisibleMatched, which fills one list with the
    matching elements and another with the corresponding function results.

    Returns a 3-tuple (first matching element, first two fields of its
    function result), or None when nothing matched.
    """
    def parsePublished(text):
        return _getPublished(publishedFormat, text)

    matchedElements = []
    matchResults = []
    lxmlutil.findAllVisibleMatched(matchedElements, element,
                                   parsePublished, parsePublished,
                                   matchResults, includeSelf=True)

    if not matchedElements:
        return None
    firstResult = matchResults[0]
    return matchedElements[0], firstResult[0], firstResult[1]
def _getMainElement(contentElement, titleElement):
    """Pick the element under contentElement most likely to hold the main text.

    Candidates are the elements collected by lxmlutil.findAllVisibleMatched.
    Each one is weighted by _getChildTextLength. Candidates whose weight is at
    least _MIN_MAIN_LENGTH are re-scored as

        weight * number_of_children / (line distance to titleElement + 11)

    and the best-scoring one is returned. If no candidate reaches the
    threshold, the one with the largest raw weight is returned instead.

    Returns None when no candidate element was collected at all (previously
    this case raised ValueError from max() on an empty list).

    NOTE(review): the scoring reads ``sourceline`` on both elements and
    assumes it is an integer; confirm that callers always pass parsed
    elements.
    """
    _MIN_MAIN_LENGTH = 100

    items = []
    lxmlutil.findAllVisibleMatched(items, contentElement)
    if not items:
        # Nothing to choose from; max() below would raise on an empty list.
        return None

    weighted = [(_getChildTextLength(item), item) for item in items]
    candidates = [pair for pair in weighted if pair[0] >= _MIN_MAIN_LENGTH]
    if not candidates:
        # No element is long enough: fall back to the longest one available.
        return max(weighted, key=lambda pair: pair[0])[1]

    titleLine = titleElement.sourceline

    def score(pair):
        # An element with more children is preferred.
        # An element closer to the title element is preferred.
        # The constant 11 avoids division by zero and dampens the effect of
        # small line distances, e.g. (m/2, n/22) becomes (m/12, n/32), which
        # gives the latter (farther) element a better chance.
        weight, item = pair
        distance = abs(item.sourceline - titleLine)
        return weight * len(item.getchildren()) * 1.0 / (distance + 11)

    return max(candidates, key=score)[1]