def analyse(url, content, editorFormat, monitorTitle=None, fortest=False, elementResult=None):
    """Analyse an HTML page and collect its title, paragraphs, digest,
    published time and images.

    Args:
        url: address of the page; handed to the title and image parsers.
        content: HTML source, parsed with ``lxml.html.fromstring``.
        editorFormat: dict of per-section settings; the 'title',
            'paragraph' and 'published' entries are read (each defaults
            to an empty dict).
        monitorTitle: forwarded unchanged to ``titleparser.parse``.
        fortest: forwarded unchanged to ``titleparser.parse``.
        elementResult: optional dict that receives diagnostics about the
            elements that were picked: 'titles', plus 'element' and 'text'
            sub-dicts keyed by 'title', 'content', 'main' and 'published'.
            Pass None (the default) to skip diagnostics. The default is
            None rather than a dict literal so that no state is shared
            between calls.

    Returns:
        A dict that may contain 'title', 'paragraphs', 'content',
        'publishedtext', 'published' and 'images'. Keys whose value could
        not be determined are left out.
    """
    page = {}
    collect = elementResult is not None
    docelement = lxml.html.fromstring(content)

    titleFormat = editorFormat.get('title', {})
    title, titleElements = titleparser.parse(
        titleFormat, url, docelement, monitorTitle, fortest)
    if title:
        page['title'] = title
    if not titleElements:
        # Without candidate title elements there is nothing to anchor the
        # rest of the analysis on.
        return page
    if collect:
        elementResult['titles'] = titleElements

    titleElement, contentElement = contentparser.parse(titleElements)
    if titleElement is not None:
        titleText = lxmlutil.getCleanText(titleElement)
        # The text of the chosen element overrides the parser's title.
        page['title'] = titleText
        if collect:
            # Start from fresh sub-dicts so stale entries from an earlier
            # run of the same dict do not leak into this result.
            elementResult['element'] = {
                'title': (titleElement.tag, titleElement.sourceline),
                'content': (contentElement.tag, contentElement.sourceline),
            }
            elementResult['text'] = {
                'title': titleText,
                'content': lxmlutil.getCleanText(contentElement),
            }

    paragraphFormat = editorFormat.get('paragraph', {})
    mainElement, paragraphs = paragraphparser.parse(
        paragraphFormat, contentElement, titleElement)
    if paragraphs:
        page['paragraphs'] = paragraphs
        page['content'] = digestparser.parse(paragraphFormat, paragraphs)
    if collect and mainElement is not None:
        # setdefault: the sub-dicts only exist if the title branch above
        # ran; writing through them directly would raise KeyError.
        elementResult.setdefault('element', {})['main'] = (
            mainElement.tag, mainElement.sourceline)
        elementResult.setdefault('text', {})['main'] = (
            lxmlutil.getCleanText(mainElement))

    if paragraphs:
        publishedFormat = editorFormat.get('published', {})
        publishedResult = publishedparser.parse(
            publishedFormat, titleElement, mainElement)
        if publishedResult:
            # publishedResult is indexed as (element, text, value).
            publishedElement = publishedResult[0]
            page['publishedtext'] = publishedResult[1]
            page['published'] = publishedResult[2]
            if collect and publishedElement is not None:
                elementResult.setdefault('element', {})['published'] = (
                    publishedElement.tag, publishedElement.sourceline)
                elementResult.setdefault('text', {})['published'] = (
                    lxmlutil.getCleanText(publishedElement))

    images = imgparser.parse(url, contentElement, titleElement, mainElement)
    if images:
        page['images'] = images
    return page
def _getMainElement(titleElement):
    """Score every ancestor of ``titleElement`` and return the best one.

    An ancestor's weight is the amount of clean text it adds over the
    previous level of the chain, minus the squared difference between the
    title's source line and the ancestor's source line.

    Returns:
        The ``(weight, element)`` tuple with the highest weight, or None
        when ``titleElement`` has no parent.
    """
    ancestor = titleElement.getparent()
    if ancestor is None:
        return None

    child = titleElement
    candidates = []
    while ancestor is not None:
        childLength = len(lxmlutil.getCleanText(child))
        ancestorLength = len(lxmlutil.getCleanText(ancestor))
        lineGap = titleElement.sourceline - ancestor.sourceline
        # The title and its container should be as close as possible, so
        # the squared line distance is subtracted as a penalty.
        score = (ancestorLength - childLength) - math.pow(lineGap, 2)
        candidates.append((score, ancestor))
        # Move one level up the tree.
        child, ancestor = ancestor, ancestor.getparent()
    return max(candidates, key=lambda pair: pair[0])
def getValueBySelectors(element, selectors):
    """Return the first non-empty value produced by ``selectors``.

    Each selector is applied to ``element`` through ``getElementValue``.
    A string match is used as is; any other match is converted with
    ``lxmlutil.getCleanText``. The search stops at the first truthy value.

    Returns:
        The first truthy value found. If there is none, the last (falsy)
        value that was computed, or None when no selector matched at all.
    """
    value = None
    for selector in selectors:
        found = getElementValue(element, selector)
        if found is None:
            continue
        if isinstance(found, basestring):
            value = found
        else:
            value = lxmlutil.getCleanText(found)
        if value:
            return value
    return value
def _getParagraphsByTag(element, tag):
    """Collect paragraph texts from the direct children of ``element``
    whose tag equals ``tag``.

    For each matching child the clean text is used; when that is empty the
    child's tail text is used instead. The text is stripped and children
    that end up with no text are skipped.

    Returns:
        A list of non-empty strings, in document order.
    """
    paragraphs = []
    matching = (child for child in element.getchildren() if child.tag == tag)
    for child in matching:
        # Fall back to the tail text when the child itself has no text.
        text = lxmlutil.getCleanText(child) or child.tail
        if not text:
            continue
        text = text.strip()
        if text:
            paragraphs.append(text)
    return paragraphs
def _detectDetailUrl(url, title):
    """Fetch ``url`` and look for the link whose text equals ``title``.

    The page is fetched with ``ContentFetcher(url, tried=2)``. The first
    anchor whose clean text equals ``title`` decides the result.

    NOTE(review): this module defines ``_detectDetailUrl`` a second time
    further down with the same logic; that later definition replaces this
    one at import time.

    Returns:
        The anchor's href joined against ``url``; the raw (falsy) href if
        that anchor has none; or None when nothing was fetched or no
        anchor matches.
    """
    fetchResult = ContentFetcher(url, tried=2).fetch()
    html = fetchResult.get('content')
    if not html:
        return None

    anchors = pyquery.PyQuery(lxml.html.fromstring(html))('a')
    for anchor in anchors:
        if lxmlutil.getCleanText(anchor) == title:
            href = anchor.get('href')
            # Resolve relative links against the fetched page's address.
            return urlparse.urljoin(url, href) if href else href
    return None
def _detectDetailUrl(url, title):
    """Find, on the page at ``url``, the link whose text equals ``title``.

    The page is fetched with ``ContentFetcher(url, tried=2)`` and every
    ``<a>`` element is inspected in document order. The first anchor whose
    clean text equals ``title`` decides the result.

    NOTE(review): an identically named function with the same logic is
    defined earlier in this module; being the later one, this definition
    is the one in effect.

    Returns:
        The anchor's href joined against ``url``; the raw (falsy) href if
        that anchor has none; or None when nothing was fetched or no
        anchor matches.
    """
    retries = 2
    pageContent = ContentFetcher(url, tried=retries).fetch().get('content')
    if not pageContent:
        return None

    document = lxml.html.fromstring(pageContent)
    for link in pyquery.PyQuery(document)('a'):
        if lxmlutil.getCleanText(link) != title:
            continue
        target = link.get('href')
        if not target:
            # A matching anchor without a usable href ends the search.
            return target
        return urlparse.urljoin(url, target)
    return None
def _getParagraphLengthByLink(element):
    """Measure the amount of text that ``element`` carries inline.

    The total is the stripped length of the element's own leading text
    plus, for every direct child whose tag is in ``lxmlutil.INLINE_TAGS``,
    the length of the child's clean text and of its stripped tail.

    Returns:
        The character count as an int; always 0 for ``<li>`` elements.
    """
    if element.tag == 'li':
        return 0

    total = len(element.text.strip()) if element.text else 0
    for child in element.getchildren():
        # <br> is treated specially because some sites use it as a
        # paragraph separator: neither it nor its tail is counted.
        # Children that are not inline tags are skipped as well.
        if child.tag == 'br' or child.tag not in lxmlutil.INLINE_TAGS:
            continue
        cleaned = lxmlutil.getCleanText(child)
        if cleaned:
            total += len(cleaned)
        if child.tail:
            total += len(child.tail.strip())
    return total