def parse(self):
    """Parse a search result page into teasers and paging information.

    Runs the parent class' ``parse()`` and then scans ``self.content``:

    * ``resultsPattern`` supplies the page size (group 1, stored in
      ``self.resultsPerPage``) and the hit count (group 2, stored in
      ``self.results``).  A missing or empty hit count ends parsing early.
    * ``Teaser.parse`` is called repeatedly until it returns ``-1``; every
      teaser that reports itself valid is appended to ``self.teasers``.
    * ``loadMorePattern`` is searched starting at the offset of the last
      parse attempt; its group 1, stripped, becomes ``self.moreUrl``.

    ``self.teasers``, ``self.resultsPerPage``, ``self.results`` and
    ``self.moreUrl`` are all reset before any parsing happens, so they are
    defined (``[]``, ``0``, ``0``, ``None``) on every exit path.
    """
    super(SearchResource, self).parse()

    self.teasers = []
    self.resultsPerPage = 0
    self.results = 0
    # Reset here rather than at the end: the early return below would
    # otherwise leave moreUrl unset (or stale from a previous parse).
    self.moreUrl = None

    pos = 0
    resultsMatch = resultsPattern.search(self.content, pos)
    if resultsMatch is not None:
        loadMoreSize = resultsMatch.group(1)
        if loadMoreSize:
            self.resultsPerPage = int(loadMoreSize)

        loadMoreCount = resultsMatch.group(2)
        if not loadMoreCount:
            # No hit count available: nothing further is parsed.
            return
        self.results = int(loadMoreCount)
        pos = resultsMatch.end(0)

    # prevPos remembers where the last parse attempt started, because pos
    # itself is -1 once the loop has finished.
    prevPos = 0
    while pos != -1:
        teaser = Teaser()
        prevPos = pos
        pos = teaser.parse(self.content, pos, self._getBaseUrl())
        if teaser.valid():
            self.teasers.append(teaser)

    loadMoreMatch = loadMorePattern.search(self.content, prevPos)
    if loadMoreMatch is not None:
        self.moreUrl = loadMoreMatch.group(1).strip()
def _parseModule(self, pos, contentPattern, textPattern, datePattern):
    """Extract the API token found at or after ``pos`` in ``self.content``.

    A throw-away ``Teaser`` is used only for its ``parseApiToken`` helper;
    the token it finds is copied to ``self.apiToken``.  ``contentPattern``,
    ``textPattern`` and ``datePattern`` are accepted but not used here.

    Always returns ``None``.
    """
    tokenReader = Teaser()
    tokenReader.parseApiToken(self.content, pos)
    self.apiToken = tokenReader.apiToken
    return None
def parse(self):
    """Parse the loaded content into a single teaser.

    Runs the parent class' ``parse()``, searches ``self.content`` from
    offset 0 with ``self.teaserLazyload.teaserPattern`` and hands the match
    (which may be ``None``) to ``Teaser.parse`` together with
    ``self.teaserLazyload.baseUrl``.

    Afterwards ``self.teaser`` is the parsed teaser if it reports itself
    valid, otherwise ``None``.
    """
    super(TeaserLazyloadResource, self).parse()

    start = 0
    candidate = Teaser()
    lazyload = self.teaserLazyload
    found = lazyload.teaserPattern.search(self.content, start)
    candidate.parse(self.content, start, lazyload.baseUrl, found)

    self.teaser = None
    if candidate.valid():
        self.teaser = candidate
def _parseClusterTeasers(self, cluster):
    """Parse the teasers of one cluster and append them to ``cluster.teasers``.

    Items are located with ``clusterItemPattern`` when ``cluster.listType``
    is ``'cluster'`` and with ``sectionItemPattern`` otherwise.  Scanning
    starts at ``cluster.listStart`` and continues while the current item
    starts before ``cluster.listEnd`` and another item match exists.  Only
    teasers that report themselves valid are kept.
    """
    itemPattern = sectionItemPattern
    if cluster.listType == 'cluster':
        itemPattern = clusterItemPattern

    pos = cluster.listStart
    itemMatch = itemPattern.search(self.content, pos)
    while pos < cluster.listEnd and itemMatch is not None:
        teaser = Teaser()
        pos = teaser.parse(self.content, pos, self._getBaseUrl(), itemMatch)
        if teaser.valid():
            cluster.teasers.append(teaser)

        # Teaser.parse uses -1 as its "nothing more to parse" result
        # elsewhere in this code.  CPython clamps a negative start offset
        # passed to pattern.search() to 0, so searching on would rescan the
        # page from the top and could append duplicates or never terminate.
        if pos == -1:
            break

        itemMatch = itemPattern.search(self.content, pos)
        if itemMatch is not None:
            # Continue from the start of the next item so that the range
            # check in the loop condition applies to that item.
            pos = itemMatch.start(0)
def _createTeaser(self, itemMatch, itemPattern):
    """Return a new, empty teaser object suited to the matched item.

    Group 1 of ``itemMatch`` is inspected: if it contains the substring
    ``'lazyload'`` a ``TeaserLazyload`` built from ``itemPattern`` is
    returned, otherwise a plain ``Teaser``.
    """
    matchedClass = itemMatch.group(1)
    if matchedClass.find('lazyload') == -1:
        return Teaser()
    return TeaserLazyload(itemPattern)
def parse(self):
    """Parse the live-tv cells of the page into ``self.teasers``.

    Runs the parent class' ``parse()`` and then walks every match of
    ``livetvCellPattern`` in ``self.content``.  For each cell a ``Teaser``
    is filled by ``_parseTitle``, ``_parseContentName`` and ``_parseImage``
    (each receives the offset returned by the previous step) and is kept
    only if both its ``title`` and ``contentName`` were found.

    If the page contains no cell at all the method returns without
    assigning ``self.teasers``.
    """
    super(LiveTvResource, self).parse()

    cell = livetvCellPattern.search(self.content)
    if cell is None:
        # No live-tv cells on this page: leave without touching self.teasers.
        return

    self.teasers = []
    while True:
        channel = Teaser()
        cursor = self._parseTitle(cell.end(0), channel)
        cursor = self._parseContentName(cursor, channel)
        cursor = self._parseImage(cursor, channel)

        if not (channel.title is None or channel.contentName is None):
            self.teasers.append(channel)

        # The next cell is looked up from where the image parser stopped.
        cell = livetvCellPattern.search(self.content, cursor)
        if cell is None:
            break
def _parseModule(self, pos):
    """Parse the module item at or after ``pos`` into a teaser.

    The item text runs from the end of the next ``moduleItemPattern`` match
    (or from ``pos`` itself when there is none) up to the end of the next
    ``listPattern`` match, or to the end of ``self.content`` when no list
    follows.  Label, category, title, text and date are parsed from that
    text in this order, each step starting where the previous one stopped.
    A teaser that reports itself valid is appended to ``self.teasers``.

    Returns the ``listPattern`` match object, or ``None`` if no list was
    found at or after ``pos``.
    """
    match = listPattern.search(self.content, pos)
    moduleItemMatch = moduleItemPattern.search(self.content, pos)
    if moduleItemMatch is not None:
        pos = moduleItemMatch.end(0)

    # Slice bounds are exclusive, so the full length is needed here; the
    # former ``len(self.content) - 1`` silently dropped the last character.
    end = len(self.content)
    if match is not None:
        end = match.end(0)
    item = self.content[pos:end]

    teaser = Teaser()
    # Offsets below are relative to ``item``, hence the start at 0.
    p = teaser.parseLabel(item, 0)
    p = teaser.parseCategory(item, p)
    p = teaser.parseTitle(item, p, self._getBaseUrl())
    p = teaser.parseText(item, p, moduleItemTextPattern)
    teaser.parseDate(item, p, moduleItemDatePattern)

    if teaser.valid():
        self.teasers.append(teaser)
    return match
def _parseModule(self, pos):
    """Parse the module item at or after ``pos`` into a teaser.

    The item text runs from the end of the next ``moduleItemPattern`` match
    (or from ``pos`` itself when there is none) up to the end of the next
    ``listPattern`` match, or to the end of ``self.content`` when no list
    follows.  Label, category, title, text and date are parsed from that
    text in this order, each step starting where the previous one stopped.
    A teaser that reports itself valid is appended to ``self.teasers``.

    Returns the ``listPattern`` match object, or ``None`` if no list was
    found at or after ``pos``.
    """
    match = listPattern.search(self.content, pos)
    moduleItemMatch = moduleItemPattern.search(self.content, pos)
    if moduleItemMatch is not None:
        pos = moduleItemMatch.end(0)

    # Slice bounds are exclusive, so the full length is needed here; the
    # former ``len(self.content)-1`` silently dropped the last character.
    end = len(self.content)
    if match is not None:
        end = match.end(0)
    item = self.content[pos:end]

    teaser = Teaser()
    # Offsets below are relative to ``item``, hence the start at 0.
    p = teaser.parseLabel(item, 0)
    p = teaser.parseCategory(item, p)
    p = teaser.parseTitle(item, p, self._getBaseUrl())
    p = teaser.parseText(item, p, moduleItemTextPattern)
    teaser.parseDate(item, p, moduleItemDatePattern)

    if teaser.valid():
        self.teasers.append(teaser)
    return match
def _parseModuleRange(self, pos, end, contentPattern, textPattern, datePattern, moduleType, cluster=None):
    """Parse ``self.content[pos:end]`` as one module and store its teaser.

    Steps, all working on the extracted fragment:

    * the API token is parsed first, then ``contentPattern`` is searched
      from where the token parser stopped; a hit moves the text offset to
      the end of that match,
    * the image is parsed from the start of the fragment, using
      ``moduleItemVideoPattern`` when an API token was found (backslashes
      are then removed from the image value) and ``moduleItemImagePattern``
      otherwise,
    * for ``MODULE_TYPE_POST_CONTENT`` the teaser url is the path of
      ``self.url`` and its title is ``self.fallbackTitle``; for every other
      module type label, category, title, text and foot are parsed in turn.

    A teaser that reports itself valid is appended to ``cluster.teasers``
    when ``cluster`` is given, else to ``self.teasers``.  ``datePattern`` is
    accepted but not used here.  Nothing is returned.
    """
    fragment = self.content[pos:end]
    entry = Teaser()

    # From here on every offset is relative to the fragment, not the page.
    offset = entry.parseApiToken(fragment, 0)
    contentHit = contentPattern.search(fragment, offset)
    if contentHit is not None:
        offset = contentHit.end(0)

    # the teaser image for videos is encoded in the video players json parameter
    if entry.apiToken is None:
        entry.parseImage(fragment, 0, moduleItemImagePattern)
    else:
        entry.parseImage(fragment, 0, moduleItemVideoPattern)
        rawImage = entry.image
        if rawImage is not None:
            # Strip the backslashes from the value taken out of the json.
            entry.image = rawImage.replace('\\', '')

    if moduleType == MODULE_TYPE_POST_CONTENT:
        entry.url = urlparse(self.url).path
        entry.title = self.fallbackTitle
    else:
        offset = entry.parseLabel(fragment, offset)
        offset = entry.parseCategory(fragment, offset)
        offset = entry.parseTitle(fragment, offset, self._getBaseUrl())
        offset = entry.parseText(fragment, offset, textPattern)
        entry.parseFoot(fragment, offset)

    if entry.valid():
        target = self.teasers
        if cluster is not None:
            target = cluster.teasers
        target.append(entry)
print teaser rubricResource = RubricResource(baseUrl + rubric, firstCluster.listType, firstCluster.listStart, firstCluster.listEnd) rubricResource.parse() for cluster in rubricResource.clusters: print cluster for teaser in cluster.teasers: print teaser html = getUrl(searchUrl) #teaser = getTag('article', 'b-content-teaser-item') pos = 0 while pos != -1: teaser = Teaser() pos = teaser.parse(html, pos) if teaser.valid(): print teaser else: print "invalid teaser" #pattern = re.compile('.*<article class="b-content-teaser-item x-column">') pattern = re.compile('<article\s*class=[^"]*b-content-teaser-item[^"]*"\s*>', re.DOTALL) match = pattern.search(html) print match if match is not None: i = match.start(0) j = html.find('</article>', i) + len('</article>') teaser = html[i:j]