def cleanHtmlStr(str):
    """Return ``str`` after passing it through ``CParsingHelper.cleanHtmlStr``.

    Thin convenience wrapper: all of the work is delegated to the helper
    class.  The parameter name shadows the builtin ``str``; it is kept
    unchanged so existing callers are not affected.
    """
    cleaned = CParsingHelper.cleanHtmlStr(str)
    return cleaned
def parseListBase(self, data, type='video'):
    """Turn HTML fragments of a listing page into item dictionaries.

    Parameters:
        data: sequence of HTML fragments; each element is searched with
            the regexes and markers below (presumably one fragment per
            listed entry -- confirm against the caller).
        type: key into the URL pattern table, one of 'video', 'channel',
            'playlist', 'movie', 'live' or 'tray'.  The name shadows the
            builtin but is kept because callers may pass it by keyword.

    Returns:
        A list of dicts with the keys 'type', 'category', 'title', 'url',
        'icon', 'time' and 'desc'.  A fragment is skipped when no title,
        no link/id or no thumbnail could be extracted from it.

    Raises:
        KeyError: if ``type`` is not in the pattern table and ``data`` is
            not empty.
    """
    printDBG("parseListBase----------------")

    # type -> [item type reported to the caller,
    #          regex capturing the link or id,
    #          prefix that turns a bare id into a site-relative URL]
    urlPatterns = {
        'video': ['video', r'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
        'channel': ['category', r'href="(/[^"]+?)"', ''],
        'playlist': ['category', r'list=([^"]+?)"', '/playlist?list='],
        'movie': ['video', r'data-context-item-id="([^"]+?)"', '/watch?v='],
        'live': ['video', r'href="(/watch\?v=[^"]+?)"', ''],
        'tray': ['video', r'data-video-id="([^"]+?)"', '/watch?v='],
    }

    # Constant patterns are compiled once per call instead of once per item.
    spanTitleStartRe = re.compile(r'<span [^>]*?class="title[^>]*?>')
    spanEndRe = re.compile(r'</span>')
    plTitleStartRe = re.compile(r'class="pl-video-title-link[^>]*?>')
    tagOpenRe = re.compile(r'<')
    plTimeStartRe = re.compile(r'pl-video-time"[^>]*?>')
    timestampStartRe = re.compile(r'timestamp"[^>]*?>')
    videoDescStartRe = re.compile(r'class="video-description[^>]+?>')
    paragraphEndRe = re.compile(r'</p>')
    lockupDescStartRe = re.compile(r'class="yt-lockup-description[^>]+?>')
    divEndRe = re.compile(r'</div>')

    currList = []
    for idx, item in enumerate(data):
        # Looked up inside the loop so an unknown ``type`` only raises when
        # there is at least one item, exactly as before.
        itemType, urlRegex, urlPrefix = urlPatterns[type]

        # Link / id.  Only prepend the prefix when an id was really found;
        # otherwise a bare prefix such as '/watch?v=' would pass the
        # non-empty check below and produce an entry with a useless URL.
        itemId = self.getAttributes(urlRegex, item)
        url = urlPrefix + itemId if itemId else ''

        # Title: try progressively more generic sources until one matches.
        title = self.getAttributes('data-context-item-title="([^"]+?)"', item)
        if title == '':
            title = self.getAttributes('data-video-title="([^"]+?)"', item)
        if title == '':
            title = CParsingHelper.getDataBeetwenMarkers(
                item, '<h3 class="yt-lockup-title">', '</h3>', False)[1]
        if title == '':
            title = CParsingHelper.getDataBeetwenReMarkers(
                item, spanTitleStartRe, spanEndRe, False)[1]
        if title == '':
            title = CParsingHelper.getDataBeetwenReMarkers(
                item, plTitleStartRe, tagOpenRe, False)[1]
        if title == '':
            # Last resort: locate whichever tag carries the
            # "yt-lockup-title" class and read up to its closing tag.
            titleMarker = self.cm.ph.getSearchGroups(
                item, '(<[^">]+?"yt-lockup-title[^"]*?"[^>]*?>)')[0]
            if titleMarker != '':
                spaceIdx = titleMarker.find(' ')
                if spaceIdx > 0:
                    tagName = titleMarker[1:spaceIdx]
                    title = self.cm.ph.getDataBeetwenMarkers(
                        item, titleMarker, '</%s>' % tagName)[1]
        if title != '':
            # Cleaning may leave an empty string, in which case the item
            # is dropped by the completeness check further down.
            title = CParsingHelper.cleanHtmlStr(title)

        if idx == 0:
            printDBG(item)

        # Thumbnail: prefer explicit .jpg links, then any <img> source.
        img = self.getAttributes(r'data-thumb="([^"]+?\.jpg[^"]*?)"', item)
        if img == '':
            img = self.getAttributes(r'src="([^"]+?\.jpg[^"]*?)"', item)
        if img == '':
            img = self.getAttributes(r'<img[^>]+?data\-thumb="([^"]+?)"', item)
        if img == '':
            img = self.getAttributes(r'<img[^>]+?src="([^"]+?)"', item)
        if '.gif' in img:
            # .gif sources are discarded (presumably placeholder images).
            img = ''

        # Duration / timestamp text.
        duration = self.getAttributes('data-context-item-time="([^"]+?)"', item)
        if duration == '':
            duration = self.getAttributes(
                'class="video-time">([^<]+?)</span>', item)
        if duration == '':
            duration = CParsingHelper.getDataBeetwenReMarkers(
                item, plTimeStartRe, tagOpenRe, False)[1]
        if duration == '':
            duration = CParsingHelper.getDataBeetwenReMarkers(
                item, timestampStartRe, tagOpenRe, False)[1]
        duration = duration.strip()

        # Description: collect every available part, cleaned separately.
        descTab = []
        desc = self.cm.ph.getDataBeetwenMarkers(
            item, '<div class="yt-lockup-meta', '</div>')[1]
        if desc != '':
            descTab.append(desc)
        desc = self.cm.ph.getDataBeetwenMarkers(
            item, '<span class="formatted-video-count', '</span>')[1]
        if desc != '':
            descTab.append(desc)
        desc = self.cm.ph.getDataBeetwenReMarkers(
            item, videoDescStartRe, paragraphEndRe, False)[1]
        if desc == '':
            desc = self.cm.ph.getDataBeetwenReMarkers(
                item, lockupDescStartRe, divEndRe, False)[1]
        if desc != '':
            descTab.append(desc)

        newDescTab = []
        for rawDesc in descTab:
            cleanDesc = CParsingHelper.cleanHtmlStr(rawDesc)
            if cleanDesc != '':
                newDescTab.append(cleanDesc)

        # Drop anything after ';' and, for plain videos, every extra
        # query parameter after the video id.
        url = url.split(';')[0]
        if type == 'video':
            url = url.split('&')[0]

        if title != '' and url != '' and img != '':
            # Make both links absolute.  A dedicated loop variable is used
            # so the outer item index is no longer shadowed.
            absoluteLinks = []
            for link in (url, img):
                if not link.startswith(('http:', 'https:')):
                    if link.startswith('//'):
                        link = 'http:' + link
                    else:
                        link = 'http://www.youtube.com' + link
                absoluteLinks.append(link)

            # NOTE(review): the title was already cleaned above; this second
            # pass is kept from the original and is likely redundant.
            title = CParsingHelper.cleanHtmlStr(title)
            params = {
                'type': itemType,
                'category': type,
                'title': title,
                'url': absoluteLinks[0],
                # Thumbnail URLs come from HTML attribute values, where
                # '&' is escaped as '&amp;'; undo that escaping.
                'icon': absoluteLinks[1].replace('&amp;', '&'),
                'time': duration,
                'desc': '[/br]'.join(newDescTab),
            }
            currList.append(params)
    return currList