Example #1
0
    def parse(self):
        """Parse the search result page held in ``self.content``.

        Populates ``self.resultsPerPage`` and ``self.results`` from the groups
        of ``resultsPattern``, ``self.teasers`` with every valid teaser found
        after that match, and ``self.moreUrl`` with the stripped first group
        of ``loadMorePattern`` (``None`` when it does not match).

        When the result count group is missing or empty, parsing stops early
        and no teasers are collected.
        """
        super(SearchResource, self).parse()

        # Reset everything this method owns up front, so the early return
        # below can never leave ``moreUrl`` undefined or stale.
        self.teasers = []
        self.resultsPerPage = 0
        self.results = 0
        self.moreUrl = None

        pos = 0
        resultsMatch = resultsPattern.search(self.content, pos)
        if resultsMatch is not None:
            loadMoreSize = resultsMatch.group(1)
            if loadMoreSize:
                self.resultsPerPage = int(loadMoreSize)
            loadMoreCount = resultsMatch.group(2)
            if not loadMoreCount:
                # No result count available: nothing more to parse.
                return
            self.results = int(loadMoreCount)
            pos = resultsMatch.end(0)

        # Keep parsing teasers until Teaser.parse reports -1; prevPos keeps
        # the last position that was handed to the parser.
        prevPos = 0
        while pos != -1:
            teaser = Teaser()
            prevPos = pos
            pos = teaser.parse(self.content, pos, self._getBaseUrl())
            if teaser.valid():
                self.teasers.append(teaser)

        # The "load more" link is searched from the last parse position.
        loadMoreMatch = loadMorePattern.search(self.content, prevPos)
        if loadMoreMatch is not None:
            self.moreUrl = loadMoreMatch.group(1).strip()
Example #2
0
 def _parseModule(self, pos, contentPattern, textPattern, datePattern):
     """Extract the API token found at/after *pos* into ``self.apiToken``.

     ``contentPattern``, ``textPattern`` and ``datePattern`` are accepted
     but not used by this implementation.

     Returns:
         Always ``None``.
     """
     tokenParser = Teaser()
     tokenParser.parseApiToken(self.content, pos)
     self.apiToken = tokenParser.apiToken
     return None
Example #3
0
    def parse(self):
        """Parse the lazily loaded teaser from ``self.content``.

        Searches the content with the teaser pattern of
        ``self.teaserLazyload`` and hands the match to ``Teaser.parse``.
        ``self.teaser`` ends up as the parsed teaser when it is valid,
        otherwise ``None``.
        """
        super(TeaserLazyloadResource, self).parse()

        candidate = Teaser()
        start = 0
        found = self.teaserLazyload.teaserPattern.search(self.content, start)
        candidate.parse(self.content, start, self.teaserLazyload.baseUrl,
                        found)

        self.teaser = None
        if not candidate.valid():
            return
        self.teaser = candidate
Example #4
0
    def _parseClusterTeasers(self, cluster):
        """Parse the items of *cluster* and append the valid teasers to it.

        Items are located with ``clusterItemPattern`` when
        ``cluster.listType`` is ``'cluster'`` and with ``sectionItemPattern``
        otherwise. Only items whose match starts before ``cluster.listEnd``
        are parsed; valid teasers are appended to ``cluster.teasers``.
        """
        if cluster.listType == 'cluster':
            itemPattern = clusterItemPattern
        else:
            itemPattern = sectionItemPattern

        pos = cluster.listStart
        itemMatch = itemPattern.search(self.content, pos)
        # Bound every item by the start of its own match. Checking only
        # ``pos`` would let the very first item through even when it lies
        # behind the end of the list (pos is still listStart at that point).
        while itemMatch is not None and itemMatch.start(0) < cluster.listEnd:
            teaser = Teaser()
            pos = teaser.parse(self.content, pos, self._getBaseUrl(), itemMatch)
            if teaser.valid():
                cluster.teasers.append(teaser)

            # Continue with the next item after the parsed teaser.
            itemMatch = itemPattern.search(self.content, pos)
            if itemMatch is not None:
                pos = itemMatch.start(0)
Example #5
0
    def _parseClusterTeasers(self, cluster):
        """Parse the items of *cluster* and append the valid teasers to it.

        Items are located with ``clusterItemPattern`` when
        ``cluster.listType`` is ``'cluster'`` and with ``sectionItemPattern``
        otherwise. Only items whose match starts before ``cluster.listEnd``
        are parsed; valid teasers are appended to ``cluster.teasers``.
        """
        if cluster.listType == 'cluster':
            itemPattern = clusterItemPattern
        else:
            itemPattern = sectionItemPattern

        pos = cluster.listStart
        itemMatch = itemPattern.search(self.content, pos)
        # Bound every item by the start of its own match. Checking only
        # ``pos`` would let the very first item through even when it lies
        # behind the end of the list (pos is still listStart at that point).
        while itemMatch is not None and itemMatch.start(0) < cluster.listEnd:
            teaser = Teaser()
            pos = teaser.parse(self.content, pos, self._getBaseUrl(),
                               itemMatch)
            if teaser.valid():
                cluster.teasers.append(teaser)

            # Continue with the next item after the parsed teaser.
            itemMatch = itemPattern.search(self.content, pos)
            if itemMatch is not None:
                pos = itemMatch.start(0)
 def _createTeaser(self, itemMatch, itemPattern):
     """Create the teaser object matching the class captured by *itemMatch*.

     Returns a ``TeaserLazyload`` built from *itemPattern* when the first
     group of *itemMatch* contains ``'lazyload'``, a plain ``Teaser``
     otherwise.
     """
     cssClass = itemMatch.group(1)
     if cssClass.find('lazyload') == -1:
         return Teaser()
     return TeaserLazyload(itemPattern)
    def parse(self):
        """Parse the live-tv cells of ``self.content`` into ``self.teasers``.

        For every match of ``livetvCellPattern`` a teaser is filled with
        title, content name and image; it is kept only when both title and
        content name were found. A page without any live-tv cell yields an
        empty ``self.teasers`` list.
        """
        super(LiveTvResource, self).parse()

        # Initialise the result before looking for cells, so a page without
        # live-tv cells leaves an empty list instead of a missing or stale
        # attribute (no channels will be available in that case).
        self.teasers = []

        livetvCellMatch = livetvCellPattern.search(self.content)
        while livetvCellMatch is not None:
            pos = livetvCellMatch.end(0)
            teaser = Teaser()
            pos = self._parseTitle(pos, teaser)
            pos = self._parseContentName(pos, teaser)
            pos = self._parseImage(pos, teaser)
            if teaser.title is not None and teaser.contentName is not None:
                self.teasers.append(teaser)
            # Look for the next cell behind the one just parsed.
            livetvCellMatch = livetvCellPattern.search(self.content, pos)
Example #8
0
    def _parseModule(self, pos):
        """Parse the module item found at/after *pos* into a teaser.

        The item text spans from the end of the ``moduleItemPattern`` match
        up to the end of the ``listPattern`` match, or up to the end of the
        content when there is no list. A valid teaser is appended to
        ``self.teasers``.

        Returns:
            The ``listPattern`` match searched from *pos* (``None`` when the
            pattern does not match).
        """
        match = listPattern.search(self.content, pos)

        moduleItemMatch = moduleItemPattern.search(self.content, pos)
        if moduleItemMatch is not None:
            pos = moduleItemMatch.end(0)
            # Slice ends are exclusive: use len(content), not len(content) - 1,
            # otherwise the last character of the content is cut off.
            end = len(self.content) if match is None else match.end(0)
            item = self.content[pos:end]
            teaser = Teaser()
            # Each parse step continues at the position the previous returned.
            p = teaser.parseLabel(item, 0)
            p = teaser.parseCategory(item, p)
            p = teaser.parseTitle(item, p, self._getBaseUrl())
            p = teaser.parseText(item, p, moduleItemTextPattern)
            teaser.parseDate(item, p, moduleItemDatePattern)
            if teaser.valid():
                self.teasers.append(teaser)

        return match
Example #9
0
    def _parseModule(self, pos):
        """Parse the module item found at/after *pos* into a teaser.

        The item text spans from the end of the ``moduleItemPattern`` match
        up to the end of the ``listPattern`` match, or up to the end of the
        content when there is no list. A valid teaser is appended to
        ``self.teasers``.

        Returns:
            The ``listPattern`` match searched from *pos* (``None`` when the
            pattern does not match).
        """
        match = listPattern.search(self.content, pos)

        moduleItemMatch = moduleItemPattern.search(self.content, pos)
        if moduleItemMatch is not None:
            pos = moduleItemMatch.end(0)
            # Slice ends are exclusive: use len(content), not len(content) - 1,
            # otherwise the last character of the content is cut off.
            end = len(self.content) if match is None else match.end(0)
            item = self.content[pos:end]
            teaser = Teaser()
            # Each parse step continues at the position the previous returned.
            p = teaser.parseLabel(item, 0)
            p = teaser.parseCategory(item, p)
            p = teaser.parseTitle(item, p, self._getBaseUrl())
            p = teaser.parseText(item, p, moduleItemTextPattern)
            teaser.parseDate(item, p, moduleItemDatePattern)
            if teaser.valid():
                self.teasers.append(teaser)

        return match
    def _parseModuleRange(self,
                          pos,
                          end,
                          contentPattern,
                          textPattern,
                          datePattern,
                          moduleType,
                          cluster=None):
        """Parse the content slice ``[pos:end]`` as one module teaser.

        The slice is parsed only when *contentPattern* matches inside it.
        A valid teaser is appended to ``cluster.teasers`` when *cluster* is
        given, otherwise to ``self.teasers``. The teaser is printed at the
        end in every case. *datePattern* is accepted but not used here.
        """
        fragment = self.content[pos:end]

        teaser = Teaser()
        # From here on all positions are relative to the fragment.
        cursor = teaser.parseApiToken(fragment, 0)

        contentMatch = contentPattern.search(fragment, cursor)
        if contentMatch is not None:
            cursor = contentMatch.end(0)
            if teaser.apiToken is None:
                p = teaser.parseImage(fragment, 0, moduleItemImagePattern)
            else:
                # the teaser image for videos is encoded in the video
                # players json parameter, hence the backslashes to remove
                p = teaser.parseImage(fragment, 0, moduleItemVideoPattern)
                rawImage = teaser.image
                if rawImage is not None:
                    teaser.image = rawImage.replace('\\', '')
            if moduleType == MODULE_TYPE_POST_CONTENT:
                # url and title come from the resource itself in this case
                teaser.url = urlparse(self.url).path
                teaser.title = self.fallbackTitle
            else:
                p = teaser.parseLabel(fragment, cursor)
                p = teaser.parseCategory(fragment, p)
                p = teaser.parseTitle(fragment, p, self._getBaseUrl())
            p = teaser.parseText(fragment, p, textPattern)
            teaser.parseFoot(fragment, p)
            if teaser.valid():
                target = self.teasers
                if cluster is not None:
                    target = cluster.teasers
                target.append(teaser)
        print(teaser)

# Parse the rubric page limited to the list range of the first cluster and
# dump every cluster followed by its teasers.
rubricResource = RubricResource(
    baseUrl + rubric,
    firstCluster.listType,
    firstCluster.listStart,
    firstCluster.listEnd)
rubricResource.parse()
for cluster in rubricResource.clusters:
    print(cluster)
    for teaser in cluster.teasers:
        print(teaser)

# Fetch the search page and print every teaser parsed from it; parsing
# stops as soon as Teaser.parse returns -1.
html = getUrl(searchUrl)

#teaser = getTag('article', 'b-content-teaser-item')
pos = 0
while pos != -1:
    teaser = Teaser()
    pos = teaser.parse(html, pos)
    print(teaser if teaser.valid() else "invalid teaser")

# Locate the first <article> whose class attribute contains
# 'b-content-teaser-item' and cut out its markup.
# The opening quote of the attribute value has to be part of the pattern:
# without it '[^"]*' stops right in front of the quote and the pattern can
# never match a quoted class attribute. A raw string keeps '\s' from being
# treated as a string escape.
pattern = re.compile(r'<article\s*class="[^"]*b-content-teaser-item[^"]*"\s*>',
                     re.DOTALL)
match = pattern.search(html)
print(match)
if match is not None:
    i = match.start(0)
    j = html.find('</article>', i)
    # str.find returns -1 when the closing tag is missing; take the rest of
    # the document then instead of slicing with a bogus end index.
    j = len(html) if j == -1 else j + len('</article>')
    teaser = html[i:j]