Python getCleanText 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: commonutil.lxmlutil

메소드/함수: getCleanText

hotexamples.com에서의 예제들: 8

Python getCleanText - 8개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python commonutil.lxmlutil.getCleanText의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: pageanalyst.py 프로젝트: innocencelin/newseditor

def analyse(url, content, editorFormat, monitorTitle=None, fortest=False, elementResult=None):
    """Parse an HTML document and collect its title, paragraphs, digest,
    published information and images into a dict.

    Args:
        url: URL of the page; forwarded to the title and image parsers.
        content: HTML source handed to ``lxml.html.fromstring``.
        editorFormat: dict whose optional 'title', 'paragraph' and
            'published' entries are passed to the matching parsers.
        monitorTitle: optional value forwarded to ``titleparser.parse``.
        fortest: flag forwarded to ``titleparser.parse``.
        elementResult: optional dict that receives diagnostic details
            under the keys 'titles', 'element' and 'text'. When it is
            None (the default) no diagnostics are collected.

    Returns:
        A dict that may contain 'title', 'paragraphs', 'content',
        'publishedtext', 'published' and 'images'. Only 'title' (if any)
        is present when the title parser finds no title elements.
    """
    # NOTE: the default used to be a shared ``{}``, which leaked diagnostics
    # from one call into the next; None takes the existing "do not collect"
    # path instead.
    page = {}
    docelement = lxml.html.fromstring(content)

    titleFormat = editorFormat.get('title', {})
    title, titleElements = titleparser.parse(titleFormat, url, docelement, monitorTitle, fortest)
    if title:
        page['title'] = title
    if not titleElements:
        return page
    if elementResult is not None:
        elementResult['titles'] = titleElements
    titleElement, contentElement = contentparser.parse(titleElements)
    if titleElement is not None:
        # The text of the located title element overrides the parsed title.
        page['title'] = lxmlutil.getCleanText(titleElement)
    if elementResult is not None and titleElement is not None:
        elementResult['element'] = {}
        elementResult['text'] = {}

        elementResult['element']['title'] = (titleElement.tag, titleElement.sourceline)
        elementResult['text']['title'] = lxmlutil.getCleanText(titleElement)

        elementResult['element']['content'] = (contentElement.tag, contentElement.sourceline)
        elementResult['text']['content'] = lxmlutil.getCleanText(contentElement)

    paragraphFormat = editorFormat.get('paragraph', {})
    mainElement, paragraphs = paragraphparser.parse(paragraphFormat, contentElement, titleElement)
    if paragraphs:
        page['paragraphs'] = paragraphs
        page['content'] = digestparser.parse(paragraphFormat, paragraphs)
    if elementResult is not None and mainElement is not None:
        # setdefault: the sub-dicts are only created above when a title
        # element was found, so they may still be missing here.
        elementResult.setdefault('element', {})['main'] = (mainElement.tag, mainElement.sourceline)
        elementResult.setdefault('text', {})['main'] = lxmlutil.getCleanText(mainElement)

    if paragraphs:
        publishedElement = None
        publishedFormat = editorFormat.get('published', {})
        publishedResult = publishedparser.parse(publishedFormat, titleElement, mainElement)
        if publishedResult:
            # publishedResult is indexed as (element, text, value).
            page['publishedtext'] = publishedResult[1]
            page['published'] = publishedResult[2]
            publishedElement = publishedResult[0]
        if elementResult is not None and publishedElement is not None:
            elementResult.setdefault('element', {})['published'] = (
                publishedElement.tag, publishedElement.sourceline)
            elementResult.setdefault('text', {})['published'] = lxmlutil.getCleanText(publishedElement)

        images = imgparser.parse(url, contentElement, titleElement, mainElement)
        if images:
            page['images'] = images

    return page

예제 #2

파일 보기

파일: contentparser.py 프로젝트: innocencelin/newseditor

def _getMainElement(titleElement):
    """Score every ancestor of ``titleElement`` and return the best one.

    Each ancestor is weighted by how much clean text it adds over the
    previous level, minus the squared source-line distance between the
    title element and that ancestor.

    Returns:
        None when the title element has no parent, otherwise the
        ``(weight, element)`` tuple with the highest weight.
    """
    ancestor = titleElement.getparent()
    if ancestor is None:
        return None

    candidates = []
    child = titleElement
    while ancestor is not None:
        childLength = len(lxmlutil.getCleanText(child))
        ancestorLength = len(lxmlutil.getCleanText(ancestor))
        # The title and the chosen ancestor should be as close as possible,
        # so the line distance is penalised quadratically.
        lineGap = titleElement.sourceline - ancestor.sourceline
        score = (ancestorLength - childLength) - math.pow(lineGap, 2)
        candidates.append((score, ancestor))
        child, ancestor = ancestor, ancestor.getparent()

    return max(candidates, key=lambda item: item[0])

예제 #3

파일 보기

def getValueBySelectors(element, selectors):
    """Return the first non-empty value produced by any of ``selectors``.

    Each selector is resolved with ``getElementValue``. A string match is
    used as is; any other non-None match is reduced to its clean text.
    The last value seen (possibly empty or None) is returned when no
    selector yields a truthy value.
    """
    value = None
    for selector in selectors:
        hit = getElementValue(element, selector)
        if hit is not None:
            value = hit if isinstance(hit, basestring) else lxmlutil.getCleanText(hit)
        if value:
            return value
    return value

예제 #4

파일 보기

파일: htmlcontentparser.py 프로젝트: economylin/newsmonitor

def getValueBySelectors(element, selectors):
    """Try each selector in order and return the first truthy value.

    ``getElementValue`` is called once per selector. String matches are
    kept unchanged, other non-None matches are converted with
    ``lxmlutil.getCleanText``. If nothing truthy is found, the most
    recent value (which may be empty or None) is returned.
    """
    found = None
    for selector in selectors:
        candidate = getElementValue(element, selector)
        if candidate is None:
            # Nothing matched; keep whatever value is already held.
            pass
        elif isinstance(candidate, basestring):
            found = candidate
        else:
            found = lxmlutil.getCleanText(candidate)
        if found:
            return found
    return found

예제 #5

파일 보기

파일: paragraphparser.py 프로젝트: innocencelin/newseditor

def _getParagraphsByTag(element, tag):
    """Collect paragraph texts from the direct children named ``tag``.

    For each matching child the clean text is used; when that is empty
    the stripped tail text of the child is used instead. Children that
    yield no text at all are skipped.

    Returns:
        A list of non-empty paragraph strings in document order.
    """
    paragraphs = []
    for child in element.getchildren():
        if child.tag == tag:
            text = lxmlutil.getCleanText(child)
            if not text:
                # Fall back to the text that follows the child element.
                tail = child.tail
                text = tail.strip() if tail else tail
            if text:
                paragraphs.append(text)
    return paragraphs

예제 #6

파일 보기

파일: detaildetector.py 프로젝트: economylin/newsmonitor

def _detectDetailUrl(url, title):
    """Fetch ``url`` and return the absolute href of the link titled ``title``.

    The page is fetched through ``ContentFetcher`` with ``tried=2``. The
    first ``<a>`` element whose clean text equals ``title`` and which has
    a non-empty href wins; the href is resolved against ``url``.

    Returns:
        The joined URL, or None when the page has no content or no
        matching link with an href is found.
    """
    fetchResult = ContentFetcher(url, tried=2).fetch()
    content = fetchResult.get('content')
    if not content:
        return None

    root = lxml.html.fromstring(content)
    for anchor in pyquery.PyQuery(root)('a'):
        if lxmlutil.getCleanText(anchor) == title:
            href = anchor.get('href')
            if href:
                return urlparse.urljoin(url, href)
    return None

예제 #7

파일 보기

def _detectDetailUrl(url, title):
    """Look up the detail-page URL for ``title`` among the links of ``url``.

    The page is downloaded with ``ContentFetcher`` (``tried=2``), parsed
    with lxml and scanned for ``<a>`` elements whose clean text equals
    ``title``. The first such link carrying an href is resolved relative
    to ``url`` and returned.

    Returns:
        The resolved URL, or None if the fetch yields no content or no
        matching link has an href.
    """
    attempts = 2
    fetchResult = ContentFetcher(url, tried=attempts).fetch()
    html = fetchResult.get('content')
    if not html:
        return None

    document = lxml.html.fromstring(html)
    links = pyquery.PyQuery(document)('a')
    for link in links:
        if lxmlutil.getCleanText(link) == title:
            target = link.get('href')
            if target:
                return urlparse.urljoin(url, target)
    return None

예제 #8

파일 보기

파일: paragraphparser.py 프로젝트: innocencelin/newseditor

def _getParagraphLengthByLink(element):
    """Return the length of the text held directly and in inline children.

    ``li`` elements always count as 0. Otherwise the result is the
    stripped length of the element's own text plus, for every direct
    child whose tag is in ``lxmlutil.INLINE_TAGS`` (``br`` excluded), the
    length of the child's clean text and of its stripped tail.
    """
    if element.tag == 'li':
        return 0

    total = len(element.text.strip()) if element.text else 0
    for child in element.getchildren():
        # br is skipped on purpose: some sites use it as a paragraph
        # separator. Non-inline children are ignored as well.
        if child.tag == 'br' or child.tag not in lxmlutil.INLINE_TAGS:
            continue
        childText = lxmlutil.getCleanText(child)
        if childText:
            total += len(childText)
        if child.tail:
            total += len(child.tail.strip())
    return total