def cleanHtmlStr(str):
    """Return ``str`` after passing it through ``CParsingHelper.cleanHtmlStr``.

    This is a module-level shortcut that only delegates to the helper class.
    Note that the parameter name shadows the ``str`` builtin inside this
    function; it is kept as-is so keyword callers continue to work.
    """
    cleaned = CParsingHelper.cleanHtmlStr(str)
    return cleaned
# Example #2
# 0
    def parseListBase(self, data, type='video'):
        """Build a list of item descriptors from a sequence of markup fragments.

        Each element of ``data`` is searched (via ``self.getAttributes``,
        ``CParsingHelper`` and ``self.cm.ph`` helpers) for an url, a title,
        a thumbnail, a duration and description snippets.  Several alternative
        patterns are tried in a fixed order for each field; the first
        non-empty result wins.

        Args:
            data: indexable sequence of fragments to parse (presumably HTML
                snippets, one per list entry - confirm against the caller).
            type: key selecting the url pattern; one of 'video', 'channel',
                'playlist', 'movie', 'live' or 'tray'.  The name shadows the
                builtin but is kept for backward compatibility.

        Returns:
            A list of dicts with the keys 'type', 'category', 'title', 'url',
            'icon', 'time' and 'desc'.  Fragments for which the title, url or
            thumbnail ends up empty are skipped.

        Raises:
            KeyError: if ``type`` is not a known key and ``data`` is non-empty.
        """
        printDBG("parseListBase----------------")
        # [item type, url regex, prefix prepended to the regex match]
        # Raw strings keep the regex escapes (\? and friends) from being
        # treated as invalid string escape sequences.
        urlPatterns = {
            'video': ['video', r'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
            'channel': ['category', r'href="(/[^"]+?)"', ''],
            'playlist': ['category', r'list=([^"]+?)"', '/playlist?list='],
            'movie': ['video', r'data-context-item-id="([^"]+?)"', '/watch?v='],
            'live': ['video', r'href="(/watch\?v=[^"]+?)"', ''],
            'tray': ['video', r'data-video-id="([^"]+?)"', '/watch?v='],
        }

        # Loop-invariant patterns, compiled once instead of per fragment.
        tagOpenRe = re.compile('<')
        spanTitleStartRe = re.compile('<span [^>]*?class="title[^>]*?>')
        spanEndRe = re.compile('</span>')
        plTitleStartRe = re.compile('class="pl-video-title-link[^>]*?>')
        plTimeStartRe = re.compile('pl-video-time"[^>]*?>')
        timestampStartRe = re.compile('timestamp"[^>]*?>')
        videoDescStartRe = re.compile('class="video-description[^>]+?>')
        paragraphEndRe = re.compile('</p>')
        lockupDescStartRe = re.compile('class="yt-lockup-description[^>]+?>')
        divEndRe = re.compile('</div>')

        currList = []
        for idx, item in enumerate(data):
            # Looked up inside the loop so that an unknown ``type`` combined
            # with empty ``data`` still returns [] as before.
            itemType, urlRegex, urlPrefix = urlPatterns[type]

            # url: note that the prefix alone is kept when the regex finds
            # nothing, exactly as in the original implementation.
            url = urlPrefix + self.getAttributes(urlRegex, item)

            # title: try progressively more generic sources
            title = self.getAttributes(
                r'data-context-item-title="([^"]+?)"', item)
            if title == '':
                title = self.getAttributes(
                    r'data-video-title="([^"]+?)"', item)
            if title == '':
                sts, title = CParsingHelper.getDataBeetwenMarkers(
                    item, '<h3 class="yt-lockup-title">', '</h3>', False)
            if title == '':
                sts, title = CParsingHelper.getDataBeetwenReMarkers(
                    item, spanTitleStartRe, spanEndRe, False)
            if title == '':
                sts, title = CParsingHelper.getDataBeetwenReMarkers(
                    item, plTitleStartRe, tagOpenRe, False)
            if title == '':
                # Last resort: find whichever tag carries the
                # "yt-lockup-title" class and read up to its closing tag.
                titleMarker = self.cm.ph.getSearchGroups(
                    item, r'(<[^">]+?"yt-lockup-title[^"]*?"[^>]*?>)')[0]
                if titleMarker != '':
                    tidx = titleMarker.find(' ')
                    if tidx > 0:
                        # text between '<' and the first space is the tag name
                        tagName = titleMarker[1:tidx]
                        title = self.cm.ph.getDataBeetwenMarkers(
                            item, titleMarker, '</%s>' % tagName)[1]

            if title != '':
                title = CParsingHelper.cleanHtmlStr(title)
            if idx == 0:
                # debug aid: dump only the first fragment
                printDBG(item)

            # thumbnail: prefer .jpg sources, then any <img> source
            img = self.getAttributes(r'data-thumb="([^"]+?\.jpg[^"]*?)"', item)
            if img == '':
                img = self.getAttributes(r'src="([^"]+?\.jpg[^"]*?)"', item)
            if img == '':
                img = self.getAttributes(
                    r'<img[^>]+?data\-thumb="([^"]+?)"', item)
            if img == '':
                img = self.getAttributes(r'<img[^>]+?src="([^"]+?)"', item)
            if '.gif' in img:
                # .gif thumbnails are discarded, which also drops the item
                img = ''

            # duration
            duration = self.getAttributes(
                r'data-context-item-time="([^"]+?)"', item)
            if duration == '':
                duration = self.getAttributes(
                    r'class="video-time">([^<]+?)</span>', item)
            if duration == '':
                sts, duration = CParsingHelper.getDataBeetwenReMarkers(
                    item, plTimeStartRe, tagOpenRe, False)
            if duration == '':
                sts, duration = CParsingHelper.getDataBeetwenReMarkers(
                    item, timestampStartRe, tagOpenRe, False)
            duration = duration.strip()

            # description: collect every snippet that is present
            descTab = []

            desc = self.cm.ph.getDataBeetwenMarkers(
                item, '<div class="yt-lockup-meta', '</div>')[1]
            if desc != '':
                descTab.append(desc)

            desc = self.cm.ph.getDataBeetwenMarkers(
                item, '<span class="formatted-video-count', '</span>')[1]
            if desc != '':
                descTab.append(desc)

            desc = self.cm.ph.getDataBeetwenReMarkers(
                item, videoDescStartRe, paragraphEndRe, False)[1]
            if desc == '':
                desc = self.cm.ph.getDataBeetwenReMarkers(
                    item, lockupDescStartRe, divEndRe, False)[1]
            if desc != '':
                descTab.append(desc)

            cleanedDescTab = [CParsingHelper.cleanHtmlStr(d) for d in descTab]
            newDescTab = [d for d in cleanedDescTab if d != '']

            # Keep only the part before the first ';' (str.split always
            # returns at least one element); for plain videos additionally
            # drop every extra query parameter.
            url = url.split(';')[0]
            if type == 'video':
                url = url.split('&')[0]

            if title == '' or url == '' or img == '':
                continue

            # Turn protocol-relative and site-relative links into absolute
            # ones.
            correctUrlTab = [url, img]
            for pos, link in enumerate(correctUrlTab):
                if link.startswith(('http:', 'https:')):
                    continue
                if link.startswith('//'):
                    correctUrlTab[pos] = 'http:' + link
                else:
                    correctUrlTab[pos] = 'http://www.youtube.com' + link

            # Second cleaning pass kept from the original implementation;
            # whether cleanHtmlStr is idempotent cannot be told from here.
            title = CParsingHelper.cleanHtmlStr(title)
            params = {
                'type': itemType,
                'category': type,
                'title': title,
                'url': correctUrlTab[0],
                'icon': correctUrlTab[1].replace('&amp;', '&'),
                'time': duration,
                'desc': '[/br]'.join(newDescTab),
            }
            currList.append(params)

        return currList