Beispiel #1
0
def analyse(url, content, editorFormat, monitorTitle=None, fortest=False, elementResult=None):
    """Extract title, paragraphs, publish info and images from an HTML page.

    Args:
        url: Address of the page; forwarded to the title and image parsers.
        content: HTML markup handed to ``lxml.html.fromstring``.
        editorFormat: Mapping that may hold 'title', 'paragraph' and
            'published' sub-mappings; each defaults to an empty dict.
        monitorTitle: Optional value forwarded to ``titleparser.parse``.
        fortest: Flag forwarded to ``titleparser.parse``.
        elementResult: Optional dict that is filled in place with the matched
            elements ('titles', 'element', 'text'). Pass ``None`` (the
            default) to skip collecting this information.

    Returns:
        A dict that may contain the keys 'title', 'paragraphs', 'content',
        'publishedtext', 'published' and 'images', depending on what the
        parsers found.
    """
    # NOTE: the default used to be a shared mutable ``{}``, which leaked the
    # recorded elements from one call into the next. ``None`` simply disables
    # recording for callers that do not supply their own dict.
    page = {}
    docelement = lxml.html.fromstring(content)

    titleFormat = editorFormat.get('title', {})
    title, titleElements = titleparser.parse(titleFormat, url, docelement, monitorTitle, fortest)
    if title:
        page['title'] = title
    if not titleElements:
        return page

    recording = elementResult is not None
    if recording:
        elementResult['titles'] = titleElements

    titleElement, contentElement = contentparser.parse(titleElements)
    if titleElement is not None:
        # The element picked by the content parser overrides the parsed title.
        page['title'] = lxmlutil.getCleanText(titleElement)
    if recording and titleElement is not None:
        elementResult['element'] = {}
        elementResult['text'] = {}

        elementResult['element']['title'] = (titleElement.tag, titleElement.sourceline)
        elementResult['text']['title'] = lxmlutil.getCleanText(titleElement)

        # Guard against a missing content element instead of raising
        # AttributeError while merely collecting debug information.
        if contentElement is not None:
            elementResult['element']['content'] = (contentElement.tag, contentElement.sourceline)
            elementResult['text']['content'] = lxmlutil.getCleanText(contentElement)

    paragraphFormat = editorFormat.get('paragraph', {})
    mainElement, paragraphs = paragraphparser.parse(paragraphFormat, contentElement, titleElement)
    if paragraphs:
        page['paragraphs'] = paragraphs
        page['content'] = digestparser.parse(paragraphFormat, paragraphs)
    if recording and mainElement is not None:
        # setdefault: the sub-dicts only exist if a title element was recorded.
        elementResult.setdefault('element', {})['main'] = (mainElement.tag, mainElement.sourceline)
        elementResult.setdefault('text', {})['main'] = lxmlutil.getCleanText(mainElement)

    if paragraphs:
        publishedElement = None
        publishedFormat = editorFormat.get('published', {})
        publishedResult = publishedparser.parse(publishedFormat, titleElement, mainElement)
        if publishedResult:
            # publishedResult is indexed as (element, text, published value).
            page['publishedtext'] = publishedResult[1]
            page['published'] = publishedResult[2]
            publishedElement = publishedResult[0]
        if recording and publishedElement is not None:
            elementResult.setdefault('element', {})['published'] = (
                publishedElement.tag, publishedElement.sourceline)
            elementResult.setdefault('text', {})['published'] = lxmlutil.getCleanText(publishedElement)

        images = imgparser.parse(url, contentElement, titleElement, mainElement)
        if images:
            page['images'] = images

    return page
def _getMainElement(titleElement):
    """Score every ancestor of the title element and return the best one.

    Returns None when the title element has no parent; otherwise a
    ``(weight, element)`` tuple for the highest-weighted ancestor (the
    nearest one wins on ties).
    """
    ancestor = titleElement.getparent()
    if ancestor is None:
        return None

    best = None
    child = titleElement
    while ancestor is not None:
        childLength = len(lxmlutil.getCleanText(child))
        ancestorLength = len(lxmlutil.getCleanText(ancestor))
        # Reward the text gained by stepping up one level, penalise the
        # squared source-line distance between the title and the ancestor.
        lineGap = titleElement.sourceline - ancestor.sourceline
        score = (ancestorLength - childLength) - math.pow(lineGap, 2)
        if best is None or score > best[0]:
            best = (score, ancestor)
        child, ancestor = ancestor, ancestor.getparent()

    return best
Beispiel #3
0
def getValueBySelectors(element, selectors):
    """Try each selector in order and return the first non-empty value.

    A string match is used as is; any other match is converted with
    ``lxmlutil.getCleanText``. If no selector yields a truthy value, the last
    computed (falsy) value is returned, or None when nothing matched.
    """
    value = None
    for selector in selectors:
        found = getElementValue(element, selector)
        if found is not None:
            value = found if isinstance(found, basestring) else lxmlutil.getCleanText(found)
        if value:
            return value
    return value
def getValueBySelectors(element, selectors):
    """Return the first truthy value obtained by applying the selectors.

    Selectors are tried in the given order. String matches are taken
    verbatim, other matches are reduced to text via
    ``lxmlutil.getCleanText``. When every selector fails, the result is None
    or the last falsy value that was produced.
    """
    text = None
    for selector in selectors:
        candidate = getElementValue(element, selector)
        if candidate is None:
            # Nothing matched; any earlier value was falsy, so keep looking.
            continue
        if isinstance(candidate, basestring):
            text = candidate
        else:
            text = lxmlutil.getCleanText(candidate)
        if text:
            break
    return text
def _getParagraphsByTag(element, tag):
    result = []
    for item in element.getchildren():
        if item.tag != tag:
            continue
        content = lxmlutil.getCleanText(item)
        if not content:
            content = item.tail
            if content:
                content = content.strip()
        if content:
            result.append(content)
    return result
def _detectDetailUrl(url, title):
    """Fetch ``url`` and look for a link whose text equals ``title``.

    Returns the absolute URL of the first such anchor that has an ``href``,
    or None when the page has no content or no anchor qualifies.
    """
    fetchResult = ContentFetcher(url, tried=2).fetch()
    html = fetchResult.get('content')
    if not html:
        return None

    root = lxml.html.fromstring(html)
    for anchor in pyquery.PyQuery(root)('a'):
        if lxmlutil.getCleanText(anchor) == title:
            href = anchor.get('href')
            if href:
                # Resolve relative links against the fetched page address.
                return urlparse.urljoin(url, href)
    return None
Beispiel #7
0
def _detectDetailUrl(url, title):
    """Locate the detail link for ``title`` on the page at ``url``.

    The page is fetched with two tries. The first anchor whose cleaned text
    matches ``title`` and that has an ``href`` gives the result, joined
    against ``url``. Returns None if the fetch has no content or no such
    anchor exists.
    """
    maxTries = 2
    pageContent = ContentFetcher(url, tried=maxTries).fetch().get('content')
    if not pageContent:
        return None

    document = lxml.html.fromstring(pageContent)
    links = pyquery.PyQuery(document)('a')
    for link in links:
        if lxmlutil.getCleanText(link) != title:
            continue
        target = link.get('href')
        if not target:
            continue
        return urlparse.urljoin(url, target)
    return None
def _getParagraphLengthByLink(element):
    if element.tag == 'li':
        return 0
    result = 0
    if element.text:
        result += len(element.text.strip())
    for item in element.getchildren():
        # treat br specially, it is used as paragraph separator by some site
        if item.tag == 'br':
            continue
        if item.tag not in lxmlutil.INLINE_TAGS:
            continue
        text = lxmlutil.getCleanText(item)
        if text:
            result += len(text)
        if item.tail:
            result += len(item.tail.strip())
    return result