Beispiel #1
0
    def _parseCluster(self, pos, class_, fallbackTitle):
        """Register the cluster for the list at ``pos`` and set its list end.

        A ``class_`` containing 'b-content-teaser-list' is handled as a
        'content' list (section title pattern), anything else as a 'cluster'
        list (cluster title pattern).  A ``class_`` containing 'x-notitle'
        re-uses the most recently registered cluster, if there is one,
        instead of creating a new cluster.  ``fallbackTitle`` is used when no
        title is taken from the content.

        Returns the ``listPattern`` match of the following list, or None.
        """
        isContentList = 'b-content-teaser-list' in class_
        headingPattern = sectionTitlePattern if isContentList else clusterTitlePattern
        listType = 'content' if isContentList else 'cluster'

        headingMatch = headingPattern.search(self.content, pos)
        target = None
        heading = fallbackTitle
        if 'x-notitle' in class_:
            # untitled list: continue the previous cluster when one exists
            if self.clusters:
                target = self.clusters[-1]
        elif headingMatch is not None:
            heading = stripHtml(headingMatch.group(1))
            pos = headingMatch.end(0)

        if target is None:
            target = Cluster(heading, listType, pos)
            self.clusters.append(target)

        nextList = listPattern.search(self.content, pos)
        # the list ends right before the next list, or with the content
        if nextList is None:
            target.listEnd = len(self.content) - 1
        else:
            target.listEnd = nextList.start(0) - 1
        return nextList
Beispiel #2
0
    def _parseClusters(self):
        """Iterate over all lists in ``self.content`` and parse each one.

        A page-level fallback title is looked up first (second pattern is
        only tried when the first one does not match).  Each list found by
        ``listPattern`` is then dispatched by its class: module lists and
        stage teasers go to ``_parseModule`` with their own item/text
        patterns, everything else goes to ``_parseCluster``.  Each parser
        returns the match of the next list, which drives the loop.
        """
        pos = 0
        title = None
        headingMatch = (fallbackTitlePattern.search(self.content, pos)
                        or fallbackTitlePattern2.search(self.content, pos))
        if headingMatch is not None:
            title = stripHtml(headingMatch.group(1))
            pos = headingMatch.end(0)

        listMatch = listPattern.search(self.content, pos)
        while listMatch is not None:
            pos = listMatch.end(0)
            class_ = listMatch.group(1)

            if self._isModule(class_):
                modulePatterns = (moduleItemPattern, moduleItemTextPattern)
            elif self._isStageTeaser(class_):
                modulePatterns = (stageTeaserPattern, stageTeaserTextPattern)
            else:
                modulePatterns = None

            if modulePatterns is None:
                listMatch = self._parseCluster(pos, class_, title)
            else:
                itemPattern, itemTextPattern = modulePatterns
                listMatch = self._parseModule(pos, itemPattern,
                                              itemTextPattern,
                                              moduleItemDatePattern)
Beispiel #3
0
    def parseCategory(self, article, pos):
        """Read the 'genre | category' line of an article.

        Searches ``catPattern`` in ``article`` starting at ``pos``.  The first
        group is split on '|': the first field becomes the genre, the second
        one (if present) the category.  Both are passed through ``stripHtml``
        and stored on ``self.genre`` / ``self.category``; without a match
        ``stripHtml`` receives None for both.

        Returns the position after the match, or ``pos`` unchanged when
        nothing matched.
        """
        genre = None
        category = None

        found = catPattern.search(article, pos)
        if found is not None:
            fields = [field.strip()
                      for field in found.group(1).strip().split('|')]
            # str.split() always yields at least one field
            genre = fields[0]
            if len(fields) > 1:
                category = fields[1]
            pos = found.end(0)

        self.genre = stripHtml(genre)
        self.category = stripHtml(category)
        return pos
Beispiel #4
0
    def parseCategory(self, article, pos):
        """Extract genre and category from the article's category line.

        The first group of the ``catPattern`` match is split on '|'; field 0
        is the genre and field 1, when there is one, the category.  The
        values (or None when missing) are run through ``stripHtml`` and
        assigned to ``self.genre`` and ``self.category``.

        Returns the end of the match, or the incoming ``pos`` if the pattern
        was not found.
        """
        categoryLine = catPattern.search(article, pos)
        if categoryLine is None:
            fields = []
        else:
            fields = categoryLine.group(1).strip().split('|')
            pos = categoryLine.end(0)

        genre = fields[0].strip() if len(fields) > 0 else None
        category = fields[1].strip() if len(fields) > 1 else None

        self.genre = stripHtml(genre)
        self.category = stripHtml(category)
        return pos
    def _parseCluster(self, pos, class_, fallbackTitle):
        """Create (or re-use) the cluster for the list starting at ``pos``.

        'b-content-teaser-list' in ``class_`` selects the section title
        pattern and the 'content' list type; otherwise the cluster title
        pattern and the 'cluster' list type are used.  With 'x-notitle' in
        ``class_`` the last registered cluster is re-used when available.
        The cluster's ``listEnd`` is set to just before the next list, or to
        the last index of the content when no further list exists.

        Returns the ``listPattern`` match of the next list, or None.
        """
        if 'b-content-teaser-list' in class_:
            titlePattern, listType = sectionTitlePattern, 'content'
        else:
            titlePattern, listType = clusterTitlePattern, 'cluster'

        titleMatch = titlePattern.search(self.content, pos)
        title = fallbackTitle
        cluster = None
        if 'x-notitle' in class_:
            # no title of its own: attach to the previous cluster, if any
            if self.clusters:
                cluster = self.clusters[-1]
        elif titleMatch is not None:
            title = stripHtml(titleMatch.group(1))
            pos = titleMatch.end(0)

        if cluster is None:
            cluster = Cluster(title, listType, pos)
            self.clusters.append(cluster)

        following = listPattern.search(self.content, pos)
        if following is None:
            lastIndex = len(self.content) - 1
        else:
            lastIndex = following.start(0) - 1
        cluster.listEnd = lastIndex
        return following
Beispiel #6
0
    def parseTitle(self, article, pos, baseUrl):
        """Parse the teaser link and store title, url and contentName.

        Searches ``aPattern`` in ``article`` from ``pos``.  The first group is
        the url; the title is the text between the end of the match and the
        closing '</a>' (or an earlier '<span class="arrowhover').  If no
        closing '</a>' exists, the title runs up to the end of ``article``.

        Sets ``self.title`` (through ``stripHtml``), ``self.url`` (with a
        leading ``baseUrl`` removed) and ``self.contentName`` ('/zdf' plus the
        url up to its last '.', or None).

        Returns the position after the parsed title, or ``pos`` unchanged when
        no link was found.
        """
        title = None
        url = None
        aMatch = aPattern.search(article, pos)
        if aMatch is not None:
            url = aMatch.group(1).strip()
            pos = aMatch.end(0)
            i = pos
            j = article.find('</a>', i)
            if j == -1:
                # str.find() returns -1 when the closing tag is missing;
                # using it directly would cut the title at article[i:-1] and
                # move pos back to index 3, so fall back to the article end
                j = len(article)
            # check for '<span class="arrowhover ...'
            k = article.find('<span class="arrowhover', i)
            if k != -1 and k < j:
                j = k
            title = cleanTags(article[i:j]).strip()
            # never report a position beyond the end of the article
            pos = min(j + len('</a>'), len(article))

        self.title = stripHtml(title)
        self.url = url
        self.contentName = None
        if url is not None:
            if baseUrl is not None and url.startswith(baseUrl):
                self.url = url[len(baseUrl):]
            # contentName is built from the url as found in the markup
            i = url.rfind('.')
            if i != -1:
                self.contentName = '/zdf' + url[0:i]
        return pos
    def _parseCluster(self, pos, class_, fallbackTitle):
        """Register the cluster for the list starting at ``pos``.

        ``class_`` selects title pattern and list type:
        'b-content-teaser-list' -> 'content', 'b-newsstream' -> 'cluster',
        'b-topics-module' -> 'topics', anything else -> 'cluster'.

        A 'b-content-teaser-list no-title' list does not open a new cluster.
        It extends the most recently registered cluster or, when no cluster
        exists yet, its teasers are parsed directly into ``self.teasers``.

        For a cluster without image, the first teaser of the list provides
        the image and, if the cluster has no title, also the title.

        Returns the ``listPattern`` match of the next list, or None when this
        was the last list.
        """
        titlePattern = clusterTitlePattern
        listType = 'cluster'
        if 'b-content-teaser-list' in class_:
            titlePattern = sectionTitlePattern
            listType = 'content'
        elif 'b-newsstream' in class_:
            titlePattern = newsStreamTitlePattern
            listType = 'cluster'
        elif 'b-topics-module' in class_:
            titlePattern = topicsModuleTitlePattern
            listType = 'topics'

        titleMatch = titlePattern.search(self.content, pos)
        cluster = None
        title = fallbackTitle
        # if content-teaser-list has no title, use previous cluster to
        # calculate list end
        if 'b-content-teaser-list no-title' in class_:
            if self.clusters:
                cluster = self.clusters[-1]
            else:
                nextClusterMatch = listPattern.search(self.content, pos)
                if nextClusterMatch is not None:
                    listEnd = nextClusterMatch.end(0)
                else:
                    # search() returns None when this is the last list on the
                    # page; the teasers then run up to the end of the content
                    listEnd = len(self.content) - 1
                tmpCluster = Cluster(None, listType, pos, listEnd)
                self._parseClusterTeasers(tmpCluster)
                self.teasers.extend(tmpCluster.teasers)
                return nextClusterMatch

        elif titleMatch is not None:
            # title can be None in case of 'x-notitle' in 'topics' list
            title = stripHtml(titleMatch.group(1))
            pos = titleMatch.end(0)

        if cluster is None:
            cluster = Cluster(title, listType, pos)
            self.clusters.append(cluster)

        match = listPattern.search(self.content, pos)

        if match is not None:
            cluster.listEnd = match.start(0) - 1
        else:
            cluster.listEnd = len(self.content) - 1

        # use first teaser image as cluster image
        if cluster.image is None:
            tmpCluster = Cluster(None, listType, cluster.listStart,
                                 cluster.listEnd)
            self._parseClusterTeasers(tmpCluster, True)
            if tmpCluster.teasers:
                tmpTeaser = tmpCluster.teasers[0]
                cluster.image = tmpTeaser.image
                # use teaser.title as cluster fallback
                if cluster.title is None:
                    cluster.title = tmpTeaser.title

        return match
Beispiel #8
0
    def parseText(self, article, pos, pattern=textPattern):
        """Extract the teaser text and store it on ``self.text``.

        Searches ``pattern`` in ``article`` from ``pos``; the stripped first
        group (or None without a match) is passed through ``stripHtml``.

        Returns the end of the match, or ``pos`` unchanged if nothing matched.
        """
        found = pattern.search(article, pos)
        if found is None:
            self.text = stripHtml(None)
            return pos

        self.text = stripHtml(found.group(1).strip())
        return found.end(0)
Beispiel #9
0
    def parseText(self, article, pos, pattern=textPattern):
        """Parse the text part of an article into ``self.text``.

        ``pattern`` is searched in ``article`` starting at ``pos``.  On a
        match the first group is stripped and the position moves to the end
        of the match; otherwise the text stays None.  The value is stored
        after being run through ``stripHtml``.

        Returns the (possibly advanced) position.
        """
        text = None
        hit = pattern.search(article, pos)
        if hit is not None:
            text, pos = hit.group(1).strip(), hit.end(0)

        self.text = stripHtml(text)
        return pos
Beispiel #10
0
    def parseCategory(self, article, pos):
        """Read genre and category following the category marker.

        ``catPattern`` locates the category section.  After it,
        ``catCategoryPattern`` provides the genre and ``catBrandPattern`` the
        category, each searched from the position reached so far; a pattern
        that does not match leaves its value None and the position untouched.
        Both values are stored through ``stripHtml`` on ``self.genre`` and
        ``self.category``.

        Returns the position after the last successful match, or ``pos``
        unchanged when ``catPattern`` was not found.
        """
        genre = None
        category = None

        section = catPattern.search(article, pos)
        if section is not None:
            pos = section.end(0)

            values = []
            # order matters: the brand is searched after the category match
            for fieldPattern in (catCategoryPattern, catBrandPattern):
                found = fieldPattern.search(article, pos)
                if found is None:
                    values.append(None)
                else:
                    values.append(found.group(1).strip())
                    pos = found.end(0)
            genre, category = values

        self.genre = stripHtml(genre)
        self.category = stripHtml(category)
        return pos
Beispiel #11
0
    def _parseClusters(self):
        """Parse every list in ``self.content`` into modules or clusters.

        A fallback title is taken from ``fallbackTitlePattern`` when it
        matches.  Lists whose class contains 'b-content-module' are handled
        by ``_parseModule``; all others by ``_parseCluster``, which receives
        the fallback title.  Both return the match of the next list.
        """
        pos = 0
        title = None
        pageTitle = fallbackTitlePattern.search(self.content, pos)
        if pageTitle is not None:
            title, pos = stripHtml(pageTitle.group(1)), pageTitle.end(0)

        current = listPattern.search(self.content, pos)
        while current is not None:
            pos = current.end(0)
            class_ = current.group(1)
            if 'b-content-module' not in class_:
                current = self._parseCluster(pos, class_, title)
            else:
                current = self._parseModule(pos)
Beispiel #12
0
 def _parseClusters(self):
     """Parse all lists of ``self.content`` as modules or clusters.

     The fallback title comes from ``fallbackTitlePattern`` or, if that does
     not match, from ``fallbackTitlePattern2``.  A list whose class contains
     'b-content-module' is parsed by ``_parseModule``, any other list by
     ``_parseCluster``; each returns the match of the next list.
     """
     pos = 0
     title = None
     headingMatch = (fallbackTitlePattern.search(self.content, pos)
                     or fallbackTitlePattern2.search(self.content, pos))
     if headingMatch is not None:
         title = stripHtml(headingMatch.group(1))
         pos = headingMatch.end(0)

     listMatch = listPattern.search(self.content, pos)
     while listMatch is not None:
         pos = listMatch.end(0)
         class_ = listMatch.group(1)
         isModule = 'b-content-module' in class_
         if isModule:
             listMatch = self._parseModule(pos)
         else:
             listMatch = self._parseCluster(pos, class_, title)
Beispiel #13
0
    def parseLabel(self, article, pos):
        """Parse the label block and store ``self.label`` and ``self.type``.

        ``labelPattern`` locates the label; ``getTag`` fetches its 'div'.
        The icon pattern's first group (if it matches inside the div) becomes
        the type.  The label text is what lies between the first '>' and the
        last '</div>', with 'abbr' tags stripped, tags cleaned and whitespace
        trimmed; it is stored through ``stripHtml``.

        Returns the new position, or ``pos`` unchanged without a label.
        """
        labelText = None
        iconType = None

        found = labelPattern.search(article, pos)
        if found is not None:
            labelTags = getTag('div', article, found)
            icon = iconPattern.search(labelTags)
            if icon is not None:
                iconType = icon.group(1)

            start = labelTags.find('>') + len('>')
            end = labelTags.rfind('</div>')
            # NOTE(review): this position is an index into labelTags, not
            # into article - confirm that this is what callers expect
            pos = end + len('</div>')
            labelText = cleanTags(stripTag('abbr', labelTags[start:end])).strip()

        self.label = stripHtml(labelText)
        self.type = iconType
        return pos
    def parse(self):
        """Parse the page and collect the navigation rubrics.

        After the parent class has parsed the page, the left navigation is
        located with ``leftNavPattern``.  Without it a warning is emitted and
        ``self.rubrics`` is not set.  Otherwise every ``dropdownLinksPattern``
        match after the navigation start yields a ``Rubric`` built from the
        title (group 2, html-stripped) and url (group 1, trimmed).
        """
        super(NavigationResource, self).parse()
        navMatch = leftNavPattern.search(self.content)
        if navMatch is None:
            self.warn(
                "can't find navigation in page '{}', no rubrics will be available ...",
                self.url)
            return

        # the returned markup is not used by the code below
        getTag('ul', self.content, navMatch)

        linkMatch = dropdownLinksPattern.search(self.content, navMatch.end(0))
        self.rubrics = []
        while linkMatch is not None:
            link = linkMatch.group(1).strip()
            caption = stripHtml(linkMatch.group(2))
            self.rubrics.append(Rubric(caption, link))
            # continue behind the link that was just consumed
            linkMatch = dropdownLinksPattern.search(self.content,
                                                    linkMatch.end(0))
Beispiel #15
0
    def parseLabel(self, article, pos):
        """Parse the label block and store ``self.label`` and ``self.type``.

        ``labelPattern`` locates the label and ``getTag`` returns its 'div'.
        The type is the first group of ``iconPattern`` when it matches inside
        the div.  The label text is taken from behind the first '</span>' up
        to the last '</div>'; '<strong>' markers are removed, 'abbr' and
        'span' tags stripped and whitespace trimmed before the value is
        stored through ``stripHtml``.

        Returns the new position, or ``pos`` unchanged without a label.
        """
        labelText = None
        iconType = None

        found = labelPattern.search(article, pos)
        if found is not None:
            labelTags = getTag('div', article, found)
            icon = iconPattern.search(labelTags)
            if icon is not None:
                iconType = icon.group(1)

            start = labelTags.find('</span>') + len('</span>')
            end = labelTags.rfind('</div>')
            # NOTE(review): this position is an index into labelTags, not
            # into article - confirm that this is what callers expect
            pos = end + len('</div>')

            text = labelTags[start:end]
            for marker in ('<strong>', '</strong>'):
                text = text.replace(marker, '')
            for tagName in ('abbr', 'span'):
                text = stripTag(tagName, text)
            labelText = text.strip()

        self.label = stripHtml(labelText)
        self.type = iconType
        return pos
    def parse(self):
        """Parse the page and collect the navigation rubrics without duplicates.

        After the parent class has parsed the page, the left navigation is
        located with ``leftNavPattern``.  Without it a warning is emitted and
        ``self.rubrics`` is not set.  Otherwise every ``dropdownLinksPattern``
        match after the navigation start is turned into a ``Rubric``; a url
        (as returned by ``self.parseUrl``) that was already seen is skipped.
        """
        super(NavigationResource, self).parse()
        leftNavMatch = leftNavPattern.search(self.content)
        if leftNavMatch is None:
            self.warn("can't find navigation in page '{}', no rubrics will be available ...", self.url)
            return

        # the returned markup is not used by the code below
        getTag('ul', self.content, leftNavMatch)

        pos = leftNavMatch.end(0)
        dropdownLinksMatch = dropdownLinksPattern.search(self.content, pos)
        self.rubrics = []
        # builtin set instead of the deprecated sets.Set, which no longer
        # exists in Python 3
        urls = set()
        while dropdownLinksMatch is not None:
            url = self.parseUrl(dropdownLinksMatch.group(1))
            if url not in urls:
                urls.add(url)
                title = stripHtml(dropdownLinksMatch.group(2))
                self.rubrics.append(Rubric(title, url))
            pos = dropdownLinksMatch.end(0)
            dropdownLinksMatch = dropdownLinksPattern.search(self.content, pos)
Beispiel #17
0
    def parseTitle(self, article, pos, baseUrl):
        """Parse the teaser link: title, url, playable flag and contentName.

        Searches ``aPattern`` in ``article`` from ``pos``; group 1 is the url.
        If ``titleIconPattern`` matches, ``playable`` is True when its first
        group equals 'play' and the title starts behind the next '</span>'.
        The title ends at the closing '</a>' or at an earlier '<span'; when no
        '</a>' exists it runs to the end of ``article``.  '<strong>' markers
        are removed from the title.

        Sets ``self.title`` (through ``stripHtml``), ``self.url`` (with a
        leading ``baseUrl`` removed), ``self.playable`` and
        ``self.contentName`` (url part between the last '/' and the last '.',
        or None).

        Returns the position after the parsed title, or ``pos`` unchanged when
        no link was found.
        """
        title = None
        url = None
        playable = False
        aMatch = aPattern.search(article, pos)
        if aMatch is not None:
            url = aMatch.group(1).strip()
            pos = aMatch.end(0)
            i = pos
            iconMatch = titleIconPattern.search(article, pos)
            if iconMatch is not None:
                playable = iconMatch.group(1) == 'play'
                # skip the icon span only if its closing tag exists:
                # str.find() returns -1 otherwise, which would place the
                # title start at index 6 of the article
                iconEnd = article.find('</span>', pos)
                if iconEnd != -1:
                    i = iconEnd + len('</span>')

            j = article.find('</a>', i)
            if j == -1:
                # missing closing tag: use the article end instead of -1,
                # which would cut the title at article[i:-1] and move pos
                # back to index 3
                j = len(article)
            # the title ends at a nested '<span' when one comes first
            k = article.find('<span', i)
            if k != -1 and k < j:
                j = k
            title = article[i:j]
            title = title.replace('<strong>', '').replace('</strong>', '')
            title = title.strip()
            # never report a position beyond the end of the article
            pos = min(j + len('</a>'), len(article))

        self.title = stripHtml(title)
        self.url = url
        self.playable = playable
        self.contentName = None
        if url is not None:
            if baseUrl is not None and url.startswith(baseUrl):
                self.url = url[len(baseUrl):]
            i = url.rfind('.')
            if i != -1:
                j = url.rfind('/')
                if j != -1:
                    self.contentName = url[j + 1:i]
        return pos
    def _parseClusters(self):
        """Iterate over all lists in ``self.content`` and parse each one.

        The page-level fallback title is taken from ``fallbackTitlePattern``
        or, if that does not match, ``fallbackTitlePattern2``; 'span' tags and
        html are stripped and the result is kept in ``self.fallbackTitle``.

        Each ``listPattern`` match is dispatched by its class: module lists,
        post content and stage teasers go to ``_parseModule`` with their own
        patterns and module type, group-person lists are skipped, and all
        remaining lists go to ``_parseCluster``.  Every branch yields the
        match of the next list, which drives the loop.
        """
        pos = 0
        title = None
        headingMatch = (fallbackTitlePattern.search(self.content, pos)
                        or fallbackTitlePattern2.search(self.content, pos))
        if headingMatch is not None:
            title = stripHtml(stripTag('span', headingMatch.group(1)))
            pos = headingMatch.end(0)
        self.fallbackTitle = title

        listMatch = listPattern.search(self.content, pos)
        while listMatch is not None:
            pos = listMatch.end(0)
            class_ = listMatch.group(1)

            if self._isModule(class_):
                moduleArgs = (moduleItemPattern, moduleItemTextPattern,
                              MODULE_TYPE_DEFAULT)
            elif self._isPostContent(class_):
                moduleArgs = (postContentPattern, moduleItemTextPattern,
                              MODULE_TYPE_POST_CONTENT)
            elif self._isStageTeaser(class_):
                moduleArgs = (stageTeaserPattern, stageTeaserTextPattern,
                              MODULE_TYPE_STAGE_TEASER)
            elif self._isGroupPersons(class_):
                # just skip group persons, no teasers in this section
                listMatch = listPattern.search(self.content, pos)
                continue
            else:
                listMatch = self._parseCluster(pos, class_, title)
                continue

            itemPattern, itemTextPattern, moduleType = moduleArgs
            listMatch = self._parseModule(pos, itemPattern, itemTextPattern,
                                          moduleItemDatePattern, moduleType)
Beispiel #19
0
    def parseTitle(self, article, pos, baseUrl):
        """Parse the teaser link: title, url, playable flag and contentName.

        Searches ``aPattern`` in ``article`` from ``pos``; group 1 is the url.
        If ``titleIconPattern`` matches, ``playable`` is True when its first
        group equals 'play' and the title starts behind the next '</span>'.
        The title ends at the closing '</a>' or at an earlier
        '<span class="arrowhover'; when no '</a>' exists it runs to the end of
        ``article``.  Tags are removed from the title with ``cleanTags``.

        Sets ``self.title`` (through ``stripHtml``), ``self.url`` (with a
        leading ``baseUrl`` removed), ``self.playable`` and
        ``self.contentName`` (url part between the last '/' and the last '.',
        or None).

        Returns the position after the parsed title, or ``pos`` unchanged when
        no link was found.
        """
        title = None
        url = None
        playable = False
        aMatch = aPattern.search(article, pos)
        if aMatch is not None:
            url = aMatch.group(1).strip()
            pos = aMatch.end(0)
            i = pos
            iconMatch = titleIconPattern.search(article, pos)
            if iconMatch is not None:
                playable = iconMatch.group(1) == 'play'
                # skip the icon span only if its closing tag exists:
                # str.find() returns -1 otherwise, which would place the
                # title start at index 6 of the article
                iconEnd = article.find('</span>', pos)
                if iconEnd != -1:
                    i = iconEnd + len('</span>')

            j = article.find('</a>', i)
            if j == -1:
                # missing closing tag: use the article end instead of -1,
                # which would cut the title at article[i:-1] and move pos
                # back to index 3
                j = len(article)
            # check for '<span class="arrowhover ...'
            k = article.find('<span class="arrowhover', i)
            if k != -1 and k < j:
                j = k
            title = cleanTags(article[i:j]).strip()
            # never report a position beyond the end of the article
            pos = min(j + len('</a>'), len(article))

        self.title = stripHtml(title)
        self.url = url
        self.playable = playable
        self.contentName = None
        if url is not None:
            if baseUrl is not None and url.startswith(baseUrl):
                self.url = url[len(baseUrl):]
            i = url.rfind('.')
            if i != -1:
                j = url.rfind('/')
                if j != -1:
                    self.contentName = url[j + 1:i]
        return pos