Code Example #1
    async def parse(self, input_text, *k, **kk):
        html = await get_url_service.get_url_async(input_text)
        html = PyQuery(html)
        title = ""
        for meta in html('meta[itemprop="name"]'):
            meta = PyQuery(meta)
            title = meta.attr("content")
            break
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "QQ视频全集"
        }
        for a in html(".mod_episode a"):
            a = PyQuery(a)
            _title = ""
            for span in PyQuery(a("span")):
                span = PyQuery(span)
                if span.attr("itemprop") == "episodeNumber":
                    _title = "第%s集" % span.text()
                elif span.has_class("mark_v"):
                    _title += span.children("img").attr("alt")
            info = {
                "name": _title,
                "no": _title,
                "subtitle": _title,
                "url": a.attr("href")
            }
            data["data"].append(info)
        data["total"] = len(data["data"])

        return data
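
A minimal offline sketch of the same extraction pattern, run against an invented HTML fragment (only PyQuery is required):

    from pyquery import PyQuery

    SAMPLE = '''<html><head><meta itemprop="name" content="Example Show"></head>
    <body><div class="mod_episode">
    <a href="/ep1"><span itemprop="episodeNumber">1</span></a>
    </div></body></html>'''

    doc = PyQuery(SAMPLE)
    title = doc('meta[itemprop="name"]').attr("content")
    episodes = [PyQuery(a).attr("href") for a in doc(".mod_episode a")]
    print(title, episodes)  # Example Show ['/ep1']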
Code Example #2
File: qqlistparser.py Project: wwqgtxx/wwqLyParse
    async def parse(self, input_text, *k, **kk):
        html = await get_url_service.get_url_async(input_text)
        html = PyQuery(html)
        title = ""
        for meta in html('meta[itemprop="name"]'):
            meta = PyQuery(meta)
            title = meta.attr("content")
            break
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "QQ视频全集"
        }
        for a in html(".mod_episode a"):
            a = PyQuery(a)
            _title = ""
            for span in PyQuery(a("span")):
                span = PyQuery(span)
                if span.attr("itemprop") == "episodeNumber":
                    _title = "第%s集" % span.text()
                elif span.has_class("mark_v"):
                    _title += span.children("img").attr("alt")
            info = {
                "name": _title,
                "no": _title,
                "subtitle": _title,
                "url": a.attr("href")
            }
            data["data"].append(info)
        data["total"] = len(data["data"])

        return data
Code Example #3
    def parse(self, input_text, *k, **kk):
        html = get_url(input_text)
        html = PyQuery(html)
        p_title = html("div.pl-title")
        title = p_title.attr("title")
        list_id = re.search(
            r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
            input_text).group(1)
        ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'

        first_u = ep.format(list_id, 1)
        xhr_page = get_url(first_u)
        json_data = json.loads(xhr_page[14:-2])
        # print(json_data)
        # video_cnt = json_data['data']['total']
        xhr_html = json_data['html']
        # print(xhr_html)
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection",
            "caption": "优酷视频全集"
        }
        last_num = 1
        while True:
            new_url = ep.format(list_id, last_num)
            json_data = get_url(new_url)[14:-2]
            info = json.loads(json_data)
            if info.get("error", None) == 1 and info.get("message", None) == "success":
                new_html = info.get("html", None)
                if new_html:
                    new_html = PyQuery(new_html)
                    items = new_html("a[target='video'][data-from='2-1']")
                    for item in items:
                        item = PyQuery(item)
                        url = "http:" + item.attr("href")
                        title = item.attr("title")
                        info = {
                            "name": title,
                            "no": title,
                            "subtitle": title,
                            "url": url
                        }
                        data["data"].append(info)
                    last_num += 1
                else:
                    break
            else:
                break
        data["total"] = len(data["data"])
        # print(data)

        return data
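
The [14:-2] slice above strips the a(...) JSONP wrapper and silently depends on the callback prefix keeping exactly that length; a more defensive unwrap, sketched with the standard library only:

    import json
    import re

    def strip_jsonp(text):
        # Grab the outermost {...} instead of hard-coding the prefix length.
        m = re.search(r'\{.*\}', text, re.S)
        if not m:
            raise ValueError("no JSON object in JSONP response")
        return json.loads(m.group(0))

    print(strip_jsonp('a({"error":1,"message":"success"});'))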
Code Example #4
File: youkulistparser.py Project: wwqgtxx/wwqLyParse
    async def parse(self, input_text, *k, **kk):
        html = await get_url_service.get_url_async(input_text)
        html = PyQuery(html)
        p_title = html("div.pl-title")
        title = p_title.attr("title")
        list_id = re.search(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', input_text).group(1)
        ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'

        first_u = ep.format(list_id, 1)
        xhr_page = await get_url_service.get_url_async(first_u)
        json_data = json.loads(xhr_page[14:-2])
        # print(json_data)
        # video_cnt = json_data['data']['total']
        xhr_html = json_data['html']
        # print(xhr_html)
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection",
            "caption": "优酷视频全集"
        }
        last_num = 1
        while True:
            new_url = ep.format(list_id, last_num)
            # Await the response first; slicing the un-awaited coroutine would raise a TypeError.
            json_data = await get_url_service.get_url_async(new_url)
            info = json.loads(json_data[14:-2])
            if info.get("error", None) == 1 and info.get("message", None) == "success":
                new_html = info.get("html", None)
                if new_html:
                    new_html = PyQuery(new_html)
                    items = new_html("a[target='video'][data-from='2-1']")
                    for item in items:
                        item = PyQuery(item)
                        url = "http:" + item.attr("href")
                        title = item.attr("title")
                        info = {
                            "name": title,
                            "no": title,
                            "subtitle": title,
                            "url": url
                        }
                        data["data"].append(info)
                    last_num += 1
                else:
                    break
            else:
                break
        data["total"] = len(data["data"])
        # print(data)

        return data
Code Example #5
    def Parse(self, input_text):
        html = PyQuery(self.getUrl(input_text))
        items = html('a')
        title = html('title').text()
        i = 0
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": i,
            "type": "collection"
        }
        for item in items:
            a = PyQuery(item)
            name = a.attr('title')
            if name is None:
                name = a.text()
            no = name
            subtitle = name
            url = a.attr('href')
            if url is None:
                continue
            if name is None or name == "":
                continue
            if not re.match(
                    r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)',
                    url):
                continue
            if re.search(
                    '(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)',
                    url):
                continue
            if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
                continue
            unsure = False

            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url,
                "unsure": unsure
            }
            data["data"].append(info)
            i = i + 1
        total = i
        data["total"] = total
        return data
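
Condensed, the filtering above keeps a link only when it looks like a video page and matches neither blacklist; a self-contained sketch (the reject list is shortened for illustration):

    import re

    ACCEPT = re.compile(r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)')
    REJECT = re.compile('(list|mall|about|help|download|index.html)')  # shortened

    for u in ['http://v.example.com/play/1.html', 'http://example.com/help/a.html']:
        print(u, bool(ACCEPT.match(u)) and not REJECT.search(u))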
Code Example #6
 def Parse(self, input_text):
     html2 = getUrl(input_text)
     html2 = PyQuery(html2)
     w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
     total = len(w120)
     title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": total,
         "type": "list",
         "caption": "乐视视频全集"
     }
     for i in w120:
         i = PyQuery(i)
         url = i.attr("href")
         title = i("a > img").attr("title")
         info = {
             "name": title,
             "no": title,
             "subtitle": title,
             "url": url
         }
         data["data"].append(info)
     return data
Code Example #7
File: jumpurlhandle.py Project: wluser/wwqLyParse
 def url_handle(self, input_text):
     html = get_url(input_text)
     html = PyQuery(html)
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     return url
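
These jump handlers assume the fetched body is a small fragment whose direct child is the target anchor; the pattern in isolation, on an invented fragment:

    from pyquery import PyQuery

    fragment = '<div><a href="http://example.com/target">continue</a></div>'
    doc = PyQuery(fragment)
    link = PyQuery(doc.children('a'))
    print(link.attr('href'))  # http://example.com/target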
Code Example #8
File: lelistparser.py Project: erics8/wwqLyParse
 def parse(self, input_text, *k, **kk):
     html2 = get_url(input_text)
     html2 = PyQuery(html2)
     w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
     total = len(w120)
     title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": total,
         "type": "list",
         "caption": "乐视视频全集"
     }
     for i in w120:
         i = PyQuery(i)
         url = i.attr("href")
         title = i("a > img").attr("title")
         info = {
             "name": title,
             "no": title,
             "subtitle": title,
             "url": url
         }
         data["data"].append(info)
     return data
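
The long child-combinator chain can be exercised offline; a sketch against invented stub markup:

    from pyquery import PyQuery

    stub = '''<div class="gut"><div class="listTab"><div class="listPic">
    <div class="list"><dl class="w120"><dt>
    <a href="http://www.le.com/ptv/vplay/1.html"><img title="第1集"></a>
    </dt></dl></div></div></div></div>'''

    doc = PyQuery(stub)
    for a in doc("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a"):
        a = PyQuery(a)
        print(a.attr("href"), a("img").attr("title"))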
Code Example #9
File: iqiyilistparser.py Project: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     if not await self._check_support(input_text):
         return []
     html_text = await get_url_service.get_url_async(input_text)
     html = PyQuery(html_text)
     title = html('h1.main_title > a').text()
     if not title:
         for a in html('div.crumb-item > a'):
             a = PyQuery(a)
             if a.attr('href') in input_text:
                 title = a.text()
     if not title:
         try:
             title = match1(html_text, '<title>([^<]+)').split('-')[0]
         except AttributeError:
             pass
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": 0,
         "type": "list",
         "caption": "271视频全集"
     }
     data["data"] = await self._get_list_info_api(html_text)
     return data
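
match1 is not shown in this listing; it is assumed to return the first capture group of a regex search, so the fallback (and the AttributeError that the except above swallows when nothing matches) can be reproduced like this:

    import re

    def match1(text, pattern):
        # Assumed behaviour: first capture group, or None when there is no match.
        m = re.search(pattern, text)
        return m.group(1) if m else None

    html_text = '<title>Some Show - Some Site</title>'
    print(match1(html_text, '<title>([^<]+)').split('-')[0].strip())  # Some Show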
Code Example #10
File: jumpurlhandle.py Project: v1-hermit/wwqLyParse
 def urlHandle(self, input_text):
     html = PyQuery(common.getUrl(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     print('urlHandle:"' + input_text + '"-->"' + url + '"')
     return url
Code Example #11
File: iqiyilistparser.py Project: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     if not await self._check_support(input_text):
         return []
     html_text = await get_url_service.get_url_async(input_text)
     html = PyQuery(html_text)
     title = html('h1.main_title > a').text()
     if not title:
         for a in html('div.crumb-item > a'):
             a = PyQuery(a)
             if a.attr('href') in input_text:
                 title = a.text()
     if not title:
         try:
             title = match1(html_text, '<title>([^<]+)').split('-')[0]
         except AttributeError:
             pass
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": 0,
         "type": "list",
         "caption": "271视频全集"
     }
     data["data"] = await self._get_list_info_api(html_text)
     return data
Code Example #12
 def urlHandle(self, input_text):
     html = PyQuery(common.getUrl(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     print('urlHandle:"' + input_text + '"-->"' + url + '"')
     return url
Code Example #13
File: listparser.py Project: v1-hermit/wwqLyParse
 def get_list_info_html(html):
     #print("get_list_info_html")
     data = []
     album_items = html('ul.site-piclist').children('li')
     for album_item in album_items:
         album_item = PyQuery(album_item)
         site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
         site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
         site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
         site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
         site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
         no = site_piclist_info_title_a.text()
         #if re.search("预告",no):
             #continue
         name = site_piclist_info_title_fs12_a.text()
         url = site_piclist_info_title_fs12_a.attr('href')
         if url is None:
             continue
         subtitle = site_piclist_info_title_fs12_a.text()
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url
         }
         data.append(info)
     return data
Code Example #14
 def url_handle(self, input_text):
     html = PyQuery(get_url(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     logging.debug('urlHandle:"' + input_text + '"-->"' + url + '"')
     return url
Code Example #15
File: jumpurlhandle.py Project: erics8/wwqLyParse
 def url_handle(self, input_text):
     html = PyQuery(get_url(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     logging.debug('urlHandle:"' + input_text + '"-->"' + url + '"')
     return url
Code Example #16
 async def url_handle(self, input_text):
     html = await get_url_service.get_url_async(input_text)
     html = PyQuery(html)
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     return url
Code Example #17
File: indexparser.py Project: zsandianv/wwqLyParse
 def Parse_le(self, input_text):
     html = PyQuery(get_url(input_text))
     items = html('dt.d_tit')
     title = "LETV"
     i = 0
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": i,
         "type": "collection"
     }
     for item in items:
         a = PyQuery(item).children('a')
         name = a.text()
         no = a.text()
         subtitle = a.text()
         url = a.attr('href')
         if url is None:
             continue
         if not re.match(r'^http://www\.le\.com/.+\.html', url):
             continue
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url,
             "caption": "首页地址列表"
         }
         data["data"].append(info)
         i = i + 1
     total = i
     data["total"] = total
     return data
Code Example #18
 def get_list_info_html(html):
     print("get_list_info_html")
     data = []
     album_items = html('ul.site-piclist').children('li')
     for album_item in album_items:
         album_item = PyQuery(album_item)
         site_piclist_info = PyQuery(
             album_item.children('div.site-piclist_info'))
         site_piclist_info_title = PyQuery(
             site_piclist_info.children('p.site-piclist_info_title'))
         site_piclist_info_title_a = PyQuery(
             site_piclist_info_title.children('a'))
         site_piclist_info_title_fs12 = PyQuery(
             site_piclist_info.children('p.fs12'))
         site_piclist_info_title_fs12_a = PyQuery(
             site_piclist_info_title_fs12.children('a'))
         no = site_piclist_info_title_a.text()
         #if re.search("预告",no):
         #continue
         name = site_piclist_info_title_fs12_a.text()
         url = site_piclist_info_title_fs12_a.attr('href')
         if url is None:
             continue
         subtitle = site_piclist_info_title_fs12_a.text()
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url
         }
         data.append(info)
     return data
Code Example #19
File: jumpurlhandle.py Project: wwqgtxx/wwqLyParse
 async def url_handle(self, input_text):
     html = await get_url_service.get_url_async(input_text)
     html = PyQuery(html)
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     return url
Code Example #20
File: listparser.py Project: snow212-cn/wwqLyParse
 def Parse_v(self, input_text):
     print(input_text)
     html = PyQuery(common.getUrl(input_text))
     datainfo_navlist = PyQuery(html("#datainfo-navlist"))
     for a in datainfo_navlist.children('a'):
         a = PyQuery(a)
         url = a.attr("href")
         if re.search('www.iqiyi.com/(a_|lib/m)', url):
             return self.Parse(url)
Code Example #21
File: anypageparser.py Project: road0001/wwqLyParse
	def Parse(self,input_text):
		html = PyQuery(self.getUrl(input_text))
		items = html('a')
		title = html('title').text()
		i =0
		data = {
			"data": [],
			"more": False,
			"title": title,
			"total": i,
			"type": "collection"
		}
		for item in items:
			a = PyQuery(item)
			name = a.attr('title')
			if name is None:
				name = a.text()
			no = name
			subtitle = name
			url = a.attr('href')
			if url is None:
				continue
			if name is None or name == "":
				continue
			if not re.match(r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)', url):
				continue
			if re.search('(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)', url):
				continue
			if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
				continue
			unsure = False
			
			info = {
				"name": name,
				"no": no,
				"subtitle": subtitle,
				"url": url,
				"unsure": unsure			
			}
			data["data"].append(info)
			i = i+1
		total = i
		data["total"] = total
		return data
Code Example #22
    def parse(self, input_text, *k, **kk):
        html = get_url(input_text)
        m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
        if not m:
            return []
        logging.info(m[0])

        html = PyQuery(html)
        p_title = html("li.p-row.p-title")
        p_title("li>a").remove()
        p_title("li>span").remove()
        title = p_title.text().replace(":", '')

        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "优酷视频全集"
        }
        last_num = 0
        while True:
            new_url = "https://list.youku.com/show/episode?id=" + m[0] + "&stage=reload_" + str(last_num) + "&callback=a"
            json_data = get_url(new_url)[14:-2]
            info = json.loads(json_data)
            if info.get("error", None) == 0 and info.get("message", None) == "success":
                new_html = info.get("html", None)
                if new_html:
                    new_html = PyQuery(new_html)
                    items = new_html("a")
                    for item in items:
                        item = PyQuery(item)
                        num = int(item.text())
                        url = "http:" + item.attr("href")
                        title = "第%02d集" % num
                        info = {
                            "name": title,
                            "no": title,
                            "subtitle": title,
                            "url": url
                        }
                        data["data"].append(info)
                        last_num = num
                    last_num += 1
                else:
                    break  # an empty page means no more episodes; 'continue' would re-request the same URL forever
            else:
                break
        data["total"] = len(data["data"])
        return data
Code Example #23
File: youkulistparser.py Project: wwqgtxx/wwqLyParse
    async def parse(self, input_text, *k, **kk):
        html = await get_url_service.get_url_async(input_text)
        m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
        if not m:
            return []
        logging.info(m[0])

        html = PyQuery(html)
        p_title = html("li.p-row.p-title")
        p_title("li>a").remove()
        p_title("li>span").remove()
        title = p_title.text().replace(":", '')

        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "优酷视频全集"
        }
        last_num = 0
        while True:
            new_url = "https://list.youku.com/show/episode?id=" + m[0] + "&stage=reload_" + str(
                last_num) + "&callback=a"
            json_data = await get_url_service.get_url_async(new_url)
            info = json.loads(json_data[14:-2])
            if info.get("error", None) == 0 and info.get("message", None) == "success":
                new_html = info.get("html", None)
                if new_html:
                    new_html = PyQuery(new_html)
                    items = new_html("a")
                    for item in items:
                        item = PyQuery(item)
                        num = int(item.text())
                        url = "http:" + item.attr("href")
                        title = "第%02d集" % num
                        info = {
                            "name": title,
                            "no": title,
                            "subtitle": title,
                            "url": url
                        }
                        data["data"].append(info)
                        last_num = num
                    last_num += 1
                else:
                    break  # an empty page means no more episodes; 'continue' would re-request the same URL forever
            else:
                break
        data["total"] = len(data["data"])
        return data
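
The reload loop advances last_num past the highest episode number seen on each page and stops once a page yields nothing; the same control flow in isolation (fetch_page is a hypothetical stand-in for the JSONP request):

    def collect_all(fetch_page):
        items, last_num = [], 0
        while True:
            page = fetch_page(last_num)
            if not page:  # an empty page terminates the loop
                break
            items.extend(page)
            last_num = max(page) + 1
        return items

    # Simulate three pages of three episodes each, then an empty page.
    print(collect_all(lambda n: list(range(n, n + 3)) if n < 9 else []))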
Code Example #24
 def parse(self, input_text, pool=pool_get_url, *k, **kk):
     logging.debug(input_text)
     html = PyQuery(get_url(input_text, pool=pool))
     datainfo_navlist = PyQuery(html(".progInfo_pic"))
     for a in datainfo_navlist.children('a'):
         a = PyQuery(a)
         url = a.attr("href")
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         result = get_main_parse()(input_text=url, types="list")
         if result:
             return result[0]
Code Example #25
 def Parse(self, input_text, pool=pool_getUrl):
     logging.debug(input_text)
     html = PyQuery(getUrl(input_text, pool=pool))
     datainfo_navlist = PyQuery(html("#datainfo-navlist"))
     for a in datainfo_navlist.children('a'):
         a = PyQuery(a)
         url = a.attr("href")
         logging.info("change %s to %s" % (input_text, url))
         try:
             from ..main import Parse as main_parse
         except Exception:
             from main import Parse as main_parse
         result = main_parse(input_text=url, types="list")
         if result:
             return result[0]
Code Example #26
File: utils.py Project: ivanp/emailsopener
def serializeArray(form):
    form = PyQuery(form)
    if not form.is_('form'):
        return []

    source = form.find('input, select, textarea')

    data = []
    for input in source:
        input = PyQuery(input)
        if input.is_('[disabled]') or not input.is_('[name]'):
            continue
        if input.is_('[type=checkbox]') and not input.is_('[checked]'):
            continue

        data.append((input.attr('name'), input.val()))

    return data
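
A quick usage sketch, assuming a pyquery recent enough to provide val(); the form markup is invented, and the checkboxes carry explicit value attributes so val() has something to return:

    from pyquery import PyQuery

    form = PyQuery('''<form>
    <input name="user" value="alice">
    <input name="agree" type="checkbox" value="yes" checked>
    <input name="skip" type="checkbox" value="no">
    <input name="off" value="x" disabled>
    </form>''')
    print(serializeArray(form))  # [('user', 'alice'), ('agree', 'yes')]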
Code Example #27
    def Parse_a(self, input_text):
        # modified from sceext2's list271.py
        def get_list_info_api1(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
            # http://cache.video.qiyi.com/jp/avlist/202340701/2/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid, page_n):
                url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
                #print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # request each page
                page_n = 0
                while True:
                    # make request url
                    page_n += 1
                    url = make_port_url(aid, page_n)
                    # get text
                    raw_text = self.getUrl(url)
                    # get list
                    sub_list = parse_one_page(raw_text)
                    if len(sub_list) > 0:
                        vlist += sub_list
                    else:  # no more data
                        break
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # remove 'var tvInfoJs={' before json text, and json just ended with '}'
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)

                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []  # just return null result

                # get and parse video info items
                vlist = info['data']['vlist']
                out = []  # output info
                for v in vlist:
                    one = {}

                    one['no'] = v['pd']
                    one['title'] = v['vn']
                    one['subtitle'] = v['vt']
                    one['url'] = v['vurl']

                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['id']

                    out.append(one)
                # get video info done
                return out

            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist

        def get_list_info_api2(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
            # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid):
                url = URL_JS_API_PORT + str(aid) + '/'
                #print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # make request url
                url = make_port_url(aid)
                # get text
                raw_text = self.getUrl(url)
                # get list
                vlist = parse_one_page(raw_text)
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # remove 'var tvInfoJs={' before json text, and json just ended with '}'
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)

                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []  # just return null result

                # get and parse video info items
                vlist = info['data']
                out = []  # output info
                for v in vlist:
                    one = {}

                    one['no'] = v['desc']
                    one['title'] = v['desc']
                    one['subtitle'] = v['shortTitle']
                    one['url'] = v['vUrl']

                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['tvId']

                    out.append(one)
                # get video info done
                return out

            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = i['no']
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist

        def get_list_info_html(html):
            print("get_list_info_html")
            data = []
            album_items = html('ul.site-piclist').children('li')
            for album_item in album_items:
                album_item = PyQuery(album_item)
                site_piclist_info = PyQuery(
                    album_item.children('div.site-piclist_info'))
                site_piclist_info_title = PyQuery(
                    site_piclist_info.children('p.site-piclist_info_title'))
                site_piclist_info_title_a = PyQuery(
                    site_piclist_info_title.children('a'))
                site_piclist_info_title_fs12 = PyQuery(
                    site_piclist_info.children('p.fs12'))
                site_piclist_info_title_fs12_a = PyQuery(
                    site_piclist_info_title_fs12.children('a'))
                no = site_piclist_info_title_a.text()
                #if re.search("预告",no):
                #continue
                name = site_piclist_info_title_fs12_a.text()
                url = site_piclist_info_title_fs12_a.attr('href')
                if url is None:
                    continue
                subtitle = site_piclist_info_title_fs12_a.text()
                info = {
                    "name": name,
                    "no": no,
                    "subtitle": subtitle,
                    "url": url
                }
                data.append(info)
            return data

        html = PyQuery(self.getUrl(input_text))
        title = html('h1.main_title').children('a').text()
        for a in html('div.crumb-item').children('a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
        i = 0
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": i,
            "type": "list"
        }
        try:
            data["data"] = get_list_info_api1(self.getUrl(input_text))
        except Exception as e:
            print(e)
        if data["data"] == []:
            try:
                data["data"] = get_list_info_api2(self.getUrl(input_text))
            except Exception as e:
                import traceback
                traceback.print_exc()
                #print(e)
        if data["data"] == []:
            try:
                data["data"] = get_list_info_html(self.getUrl(input_text))
            except Exception as e:
                print(e)

        data["total"] = len(data["data"])

        return data
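
Both API helpers above share the same response-stripping step: the port returns a payload of the form var tvInfoJs={...}, so everything before the first brace is discarded before json.loads. In isolation:

    import json

    raw_text = 'var tvInfoJs={"code":"A00000","data":{"vlist":[]}}'
    info = json.loads('{' + raw_text.split('{', 1)[1])
    print(info['code'])  # A00000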
Code Example #28
File: anypageparser.py Project: erics8/wwqLyParse
    def parse(self, input_text, *k, **kk):
        html = PyQuery(get_url(input_text))
        items = html('a')
        title = html('title').text()
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection"
        }
        urls = []
        for item in items:
            a = PyQuery(item)
            name = a.attr('title')
            if name is None:
                name = a.text()
            no = name
            subtitle = name
            url = a.attr('href')
            if url is None:
                continue
            if name is None or name == "":
                continue
            if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
                url = 'direct:' + url
            if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
                continue
            if re.search(
                    r'[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',
                    url):
                continue
            if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
                continue
            unsure = False

            for temp in urls:
                if temp == str(url):
                    # print("remove:"+url)
                    url = None
                    break
            if url is None:
                continue

            urls.append(url)

            if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
                unsure = True

            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url,
                "unsure": unsure
            }
            data["data"].append(info)
        if self.TWICE_PARSE:
            try:
                from .. import main
            except Exception as e:
                import main

            def runlist_parser(queue, url, pool):
                try:
                    result = main.parse(url, types="list", parsers_name=["iqiyilistparser.IQiYiAListParser",
                                                                         "iqiyilistparser.IQiYiLibMListParser",
                                                                         "iqiyilistparser.IQiYiVListParser"],
                                        pool=pool)[0]
                    if (result is not None) and (result != []) and (result["data"] is not None) and (
                                result["data"] != []):
                        queue.put({"result": result, "url": url})
                except IndexError:
                    pass
                except Exception as e:
                    # continue
                    logging.exception("twice parse %s failed" % url)
                    # import traceback
                    # traceback.print_exc()

            parse_urls = []
            t_results = []
            q_results = Queue()
            with WorkerPool() as pool:
                for url in urls:
                    pool.spawn(runlist_parser, q_results, url, pool)
                pool.join(timeout=self.TWICE_PARSE_TIMEOUT)
            while not q_results.empty():
                t_results.append(q_results.get())

            oldddata = data["data"]
            data["data"] = []
            for t_result in t_results:
                parse_urls.append(t_result["url"])
                for tdata in t_result["result"]["data"]:
                    tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
                data["data"].extend(t_result["result"]["data"])
            for ddata in oldddata:
                if ddata["url"] not in parse_urls:
                    # print(ddata["url"])
                    data["data"].append(ddata)
        oldddata = data["data"]
        data["data"] = []
        parsed_urls = []
        for ddata in oldddata:
            if ddata["url"] not in parsed_urls:
                data["data"].append(ddata)
                parsed_urls.append(ddata["url"])
        data["total"] = len(data["data"])
        data["caption"] = "全页地址列表"
        return data
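
The final pass keeps only the first occurrence of each URL; the same order-preserving dedup in a compact form:

    def dedup_by_url(items):
        seen, out = set(), []
        for item in items:
            if item["url"] not in seen:
                seen.add(item["url"])
                out.append(item)
        return out

    print(dedup_by_url([{"url": "a"}, {"url": "a"}, {"url": "b"}]))  # keeps "a" once, then "b"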
Code Example #29
File: iqiyilistparser.py Project: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     logging.debug(input_text)
     html = PyQuery(await get_url_service.get_url_async(input_text))
     url = ""
     # logging.debug(html)
     if not url:
         jss = html("script[type='text/javascript']")
         for item in jss:
             text = PyQuery(item).text()
             # logging.debug(text)
             if "Q.PageInfo.playPageData = {" in text or \
                     "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" in text:
                 split_text = text.replace("\r", ""). \
                                  replace("\n", ""). \
                                  replace("Q.PageInfo.playPageData = {", ""). \
                                  replace("window.Q = window.Q || {};", ""). \
                                  replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                                  replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                                  strip(). \
                                  replace("albumData:", ""). \
                                  strip()[:-1].strip()
                 logging.debug(split_text)
                 try:
                     data = json.loads(split_text)
                     print(json.dumps(data))
                     if "mixinVideos" in data and type(data["mixinVideos"]) == list:
                         for item1 in data["mixinVideos"]:
                             if type(item1) == dict and 'crumbList' in item1 and type(item1['crumbList']) == list:
                                 for item2 in item1['crumbList']:
                                     if type(item2) == dict and 'level' in item2 and \
                                             item2['level'] == 3 and 'url' in item2:
                                         url = item2['url']
                                         if url and re.search(r"www.iqiyi.com/v_", url):
                                             url = None
                             if url:
                                 logging.debug(url)
                                 break
                     elif "albumUrl" in data and data["albumUrl"]:
                         url = "http:" + data["albumUrl"]
                         logging.debug(url)
                         break
                 except json.JSONDecodeError:
                     logging.exception("IQiYiVListParser Error")
             if url:
                 break
     if not url:
         ld_json = html("script[type='application/ld+json']")
         for item in ld_json:
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
                 if "itemListElement" in data and type(data["itemListElement"]) == list:
                     for item1 in data["itemListElement"]:
                         if type(item1) == dict and 'position' in item1 and \
                                 item1['position'] == 3 and 'item' in item1:
                             if type(item1['item']) == dict and '@id' in item1['item']:
                                 url = item1['item']['@id']
                                 if url and re.search(r"www.iqiyi.com/v_", url):
                                     url = None
                     if url:
                         logging.debug(url)
                         break
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
             if url:
                 break
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             a = PyQuery(a)
             url = a.attr("href")
             if url:
                 logging.debug(url)
                 break
     if not url:
         a = PyQuery(html("a[data-albumurlkey]"))
         url = a.attr("href")
         logging.debug(url)
     if url and re.search(r"www.iqiyi.com/v_", url):
         url = None
     if url:
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         return ReCallMainParseFunc(input_text=url, types="list")
Code Example #30
import json
from typing import List
from pyquery import PyQuery

def get_bbox(pq_obj: PyQuery) -> List:
    return json.loads(pq_obj.attr('bbox'))
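
Usage on an invented element carrying a JSON-encoded bbox attribute:

    el = PyQuery('<rect bbox="[0, 0, 100, 50]"/>')
    print(get_bbox(el))  # [0, 0, 100, 50]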
Code Example #31
File: anypageparser.py Project: v1-hermit/wwqLyParse
    def Parse(self,input_text,types=None):
        if (types is not None) and ("collection" not in types):
            return
        html = PyQuery(common.getUrl(input_text))
        items = html('a')
        title = html('title').text()
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection"
        }
        urls = []
        for item in items:
            a = PyQuery(item)
            name = a.attr('title')
            if name is None:
                name = a.text()
            no = name
            subtitle = name
            url = a.attr('href')
            if url is None:
                continue
            if name is None or name == "":
                continue    
            if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
                url = 'direct:' + url
            if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
                continue
            if re.search('(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)', url):
                continue
            if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
                continue
            unsure = False
                        
            for temp in urls:
                if temp == str(url):
                    #print("remove:"+url)
                    url = None
                    break
            if url is None:
                continue
            
            urls.append(url)

                    
            if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)',url):
                unsure = True
                
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url,
                "unsure": unsure           
            }
            data["data"].append(info)
        if self.TWICE_PARSE:
            try:
                from . import listparser
            except Exception as e:
                import listparser
            try:
                from .. import run
            except Exception as e:
                import run
            def runlist_parser(queue,parser,url):
                url2 = urlHandle(url)
                try:
                    result = parser.Parse(url2)
                    if (result is not None) and (result != []) and (result["data"] is not None) and (result["data"] != []):
                        queue.put({"result":result,"url":url})
                except Exception as e:
                    #continue
                    print(e)
                    #import traceback  
                    #traceback.print_exc() 
            list_parser = listparser.ListParser()
            urlHandle = run.urlHandle
            parser_threads = []
            parse_urls = []
            t_results = []
            q_results = queue.Queue()
            for url in urls:
                for filter in list_parser.getfilters():
                    if re.search(filter,url):
                        parser_threads.append(threading.Thread(target=runlist_parser, args=(q_results,list_parser,url)))
            for parser_thread in parser_threads:
                parser_thread.start()
            for parser_thread in parser_threads:
                parser_thread.join()
            while not q_results.empty():
                t_results.append(q_results.get())
                
            oldddata = data["data"]
            data["data"] = []
            for t_result in t_results:
                parse_urls.append(t_result["url"])
                for tdata in t_result["result"]["data"]:
                    tdata["no"] = t_result["result"]["title"] +" "+ tdata["no"]
                data["data"].extend(t_result["result"]["data"])
            for ddata in oldddata:
                if ddata["url"] not in parse_urls:
                    #print(ddata["url"])
                    data["data"].append(ddata)
        data["total"] = len(data["data"])
        data["caption"] = "全页地址列表"
        return data
Code Example #32
File: iqiyilistparser.py Project: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     logging.debug(input_text)
     html = PyQuery(await get_url_service.get_url_async(input_text))
     url = ""
     # logging.debug(html)
     if not url:
         jss = html("script[type='text/javascript']")
         for item in jss:
             text = PyQuery(item).text()
             # logging.debug(text)
             if "Q.PageInfo.playPageData = {" in text or \
                     "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" in text:
                 split_text = text.replace("\r", ""). \
                                  replace("\n", ""). \
                                  replace("Q.PageInfo.playPageData = {", ""). \
                                  replace("window.Q = window.Q || {};", ""). \
                                  replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                                  replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                                  strip(). \
                                  replace("albumData:", ""). \
                                  strip()[:-1].strip()
                 logging.debug(split_text)
                 try:
                     data = json.loads(split_text)
                     print(json.dumps(data))
                     if "mixinVideos" in data and type(data["mixinVideos"]) == list:
                         for item1 in data["mixinVideos"]:
                             if type(item1) == dict and 'crumbList' in item1 and type(item1['crumbList']) == list:
                                 for item2 in item1['crumbList']:
                                     if type(item2) == dict and 'level' in item2 and \
                                             item2['level'] == 3 and 'url' in item2:
                                         url = item2['url']
                                         if url and re.search(r"www.iqiyi.com/v_", url):
                                             url = None
                             if url:
                                 logging.debug(url)
                                 break
                     elif "albumUrl" in data and data["albumUrl"]:
                         url = "http:" + data["albumUrl"]
                         logging.debug(url)
                         break
                 except json.JSONDecodeError:
                     logging.exception("IQiYiVListParser Error")
             if url:
                 break
     if not url:
         ld_json = html("script[type='application/ld+json']")
         for item in ld_json:
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
                 if "itemListElement" in data and type(data["itemListElement"]) == list:
                     for item1 in data["itemListElement"]:
                         if type(item1) == dict and 'position' in item1 and \
                                 item1['position'] == 3 and 'item' in item1:
                             if type(item1['item']) == dict and '@id' in item1['item']:
                                 url = item1['item']['@id']
                                 if url and re.search(r"www.iqiyi.com/v_", url):
                                     url = None
                     if url:
                         logging.debug(url)
                         break
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
             if url:
                 break
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             a = PyQuery(a)
             url = a.attr("href")
             if url:
                 logging.debug(url)
                 break
     if not url:
         a = PyQuery(html("a[data-albumurlkey]"))
         url = a.attr("href")
         logging.debug(url)
     if url and re.search(r"www.iqiyi.com/v_", url):
         url = None
     if url:
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         return ReCallMainParseFunc(input_text=url, types="list")
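
The script-scraping branch assumes the page embeds Q.PageInfo.playPageData = { albumData: {...} }; the replace/strip chain peels off the assignment and the outer braces, leaving the inner JSON object. The same steps on an invented payload:

    import json

    script = 'Q.PageInfo.playPageData = { albumData: {"albumUrl":"//www.iqiyi.com/a_x.html"} }'
    inner = (script.replace("Q.PageInfo.playPageData = {", "")
             .strip()
             .replace("albumData:", "")
             .strip()[:-1]  # drop the outer closing brace
             .strip())
    data = json.loads(inner)
    print("http:" + data["albumUrl"])  # http://www.iqiyi.com/a_x.html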
Code Example #33
    def parse(self, input_text, *k, **kk):
        html = PyQuery(get_url(input_text))
        items = html('a')
        title = html('title').text()
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection"
        }
        urls = []
        for item in items:
            a = PyQuery(item)
            name = a.attr('title')
            if name is None:
                name = a.text()
            no = name
            subtitle = name
            url = a.attr('href')
            if url is None:
                continue
            if name is None or name == "":
                continue
            if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
                url = 'direct:' + url
            if not re.match(
                    r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)',
                    url):
                continue
            if re.search(
                    r'[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',
                    url):
                continue
            if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
                continue
            unsure = False

            for temp in urls:
                if temp == str(url):
                    # print("remove:"+url)
                    url = None
                    break
            if url is None:
                continue

            urls.append(url)

            if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
                unsure = True

            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url,
                "unsure": unsure
            }
            data["data"].append(info)
        if self.TWICE_PARSE:
            try:
                from .. import main
            except Exception as e:
                import main

            def runlist_parser(queue, url, pool):
                try:
                    result = main.parse(
                        url,
                        types="list",
                        parsers_name=[
                            "iqiyilistparser.IQiYiAListParser",
                            "iqiyilistparser.IQiYiLibMListParser",
                            "iqiyilistparser.IQiYiVListParser"
                        ],
                        pool=pool)[0]
                    if (result is not None) and (result != []) and (
                            result["data"] is not None) and (result["data"] != []):
                        queue.put({"result": result, "url": url})
                except IndexError:
                    pass
                except Exception as e:
                    # continue
                    logging.exception("twice parse %s failed" % url)
                    # import traceback
                    # traceback.print_exc()

            parse_urls = []
            t_results = []
            q_results = Queue()
            with WorkerPool() as pool:
                for url in urls:
                    pool.spawn(runlist_parser, q_results, url, pool)
                pool.join(timeout=self.TWICE_PARSE_TIMEOUT)
            while not q_results.empty():
                t_results.append(q_results.get())

            oldddata = data["data"]
            data["data"] = []
            for t_result in t_results:
                parse_urls.append(t_result["url"])
                for tdata in t_result["result"]["data"]:
                    tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
                data["data"].extend(t_result["result"]["data"])
            for ddata in oldddata:
                if ddata["url"] not in parse_urls:
                    # print(ddata["url"])
                    data["data"].append(ddata)
        oldddata = data["data"]
        data["data"] = []
        parsed_urls = []
        for ddata in oldddata:
            if ddata["url"] not in parsed_urls:
                data["data"].append(ddata)
                parsed_urls.append(ddata["url"])
        data["total"] = len(data["data"])
        data["caption"] = "全页地址列表"
        return data
Code Example #34
 def parse(self, input_text, *k, **kk):
     logging.debug(input_text)
     html = PyQuery(get_url(input_text))
     url = ""
     if not url:
         jss = html("script[type='text/javascript']")
         for item in jss:
             text = PyQuery(item).text()
             if "Q.PageInfo.playPageData = {" in text:
                 split_text = text.replace("\r", ""). \
                                  replace("\n", ""). \
                                  replace("Q.PageInfo.playPageData = {", ""). \
                                  strip(). \
                                  replace("albumData:", ""). \
                                  strip()[:-1].strip()
                 logging.debug(split_text)
                 try:
                     data = json.loads(split_text)
                     print(json.dumps(data))
                     if "mixinVideos" in data and type(data["mixinVideos"]) == list:
                         for item1 in data["mixinVideos"]:
                             if type(item1) == dict and 'crumbList' in item1 and type(item1['crumbList']) == list:
                                 for item2 in item1['crumbList']:
                                     if type(item2) == dict and 'level' in item2 and \
                                             item2['level'] == 3 and 'url' in item2:
                                         url = item2['url']
                                         if url:
                                             break
                             if url:
                                 break
                 except json.JSONDecodeError:
                     logging.exception("IQiYiVListParser Error")
             if url:
                 break
     if not url:
         ld_json = html("script[type='application/ld+json']")
         for item in ld_json:
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
                 if "itemListElement" in data and type(
                         data["itemListElement"]) == list:
                     for item1 in data["itemListElement"]:
                         if type(item1) == dict and 'position' in item1 and \
                                 item1['position'] == 3 and 'item' in item1:
                             if type(item1['item']
                                     ) == dict and '@id' in item1['item']:
                                 url = item1['item']['@id']
                     if url:
                         break
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
             if url:
                 break
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             a = PyQuery(a)
             url = a.attr("href")
             if url:
                 break
     if url:
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         result = get_main_parse()(input_text=url, types="list")
         if result:
             return result
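For reference, a self-contained sketch of the application/ld+json breadcrumb fallback used above: it parses a BreadcrumbList, takes the position-3 item's @id, and fixes a protocol-relative URL. The inline markup and the sample URL are illustrative, not from a live page:

import json
from pyquery import PyQuery

# Illustrative markup modelled on the BreadcrumbList the parser falls back to.
html = PyQuery("""
<div>
<script type="application/ld+json">
{"itemListElement": [
  {"position": 1, "item": {"@id": "//www.iqiyi.com/"}},
  {"position": 3, "item": {"@id": "//www.iqiyi.com/a_example.html"}}
]}
</script>
</div>
""")

url = ""
for script in html("script[type='application/ld+json']"):
    data = json.loads(PyQuery(script).text())
    for element in data.get("itemListElement", []):
        if isinstance(element, dict) and element.get("position") == 3:
            item = element.get("item")
            if isinstance(item, dict) and "@id" in item:
                url = item["@id"]
    if url:
        break

if url.startswith("//"):  # protocol-relative, handled the same way above
    url = "http:" + url
print(url)  # http://www.iqiyi.com/a_example.html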
Code example #35
File: listparser.py Project: v1-hermit/wwqLyParse
    def Parse_a(self, input_text):
        # modified from sceext2's list271.py
        def get_list_info_api1(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'    # albumId: 202340701,
            # http://cache.video.qiyi.com/jp/avlist/202340701/2/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'
            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid, page_n):
                url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
                #print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # request each page
                page_n = 0
                urls = []
                while True:
                    # make request url
                    page_n += 1
                    url = make_port_url(aid, page_n)
                    # get text
                    raw_text = common.getUrl(url)
                    
                    # get list
                    sub_list = parse_one_page(raw_text)
                    for sub in sub_list:
                        url = sub['url']
                        if url in urls:
                            sub_list = []
                        else:
                            urls.append(url)
                    if len(sub_list) > 0:
                        vlist += sub_list
                    else:    # no more data
                        break
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # remove 'var tvInfoJs={' before json text, and json just ended with '}'
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)
                
                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []    # just return null result
                
                # get and parse video info items
                vlist = info['data']['vlist']
                out = []    # output info
                for v in vlist:
                    one = {}
                    
                    one['no'] = v['pd']
                    one['title'] = v['vn']
                    one['subtitle'] = v['vt']
                    one['url'] = v['vurl']
                    
                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['id']
                    
                    out.append(one)
                # get video info done
                return out
            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = "第"+str(i['no'])+"集 "+str(i['subtitle'])
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist
        
        def get_list_info_api2(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'    # albumId: 203342201,
            # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'
            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid):
                url = URL_JS_API_PORT + str(aid) + '/'
                #print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # make request url
                url = make_port_url(aid)
                # get text
                raw_text = common.getUrl(url)
                # get list
                vlist = parse_one_page(raw_text)
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # remove 'var tvInfoJs={' before json text, and json just ended with '}'
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)
                
                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []    # just return null result
                
                # get and parse video info items
                vlist = info['data']
                out = []    # output info
                for v in vlist:
                    one = {}
                    
                    one['no'] = v['desc']
                    one['title'] = v['desc']
                    one['subtitle'] = v['shortTitle']
                    one['url'] = v['vUrl']
                    
                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['tvId']
                    
                    out.append(one)
                # get video info done
                return out
            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = i['no']
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist
        
        def get_list_info_html(html):
            #print("get_list_info_html")
            data = []
            album_items = html('ul.site-piclist').children('li')
            for album_item in album_items:
                album_item = PyQuery(album_item)
                site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
                site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
                site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
                site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
                site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
                no = site_piclist_info_title_a.text()
                #if re.search("预告",no):
                    #continue
                name = site_piclist_info_title_fs12_a.text()
                url = site_piclist_info_title_fs12_a.attr('href')
                if url is None:
                    continue
                subtitle = site_piclist_info_title_fs12_a.text()
                info = {
                    "name": name,
                    "no": no,
                    "subtitle": subtitle,
                    "url": url
                }
                data.append(info)
            return data
        #print("2"+input_text)
        def run(queue, get_list_info, html_text):
            try:
                result = get_list_info(html_text)
                if result != []:
                    queue.put(result)
            except Exception as e:
                #import traceback  
                #traceback.print_exc()  
                print(e)
        html_text = common.getUrl(input_text)
        html = PyQuery(html_text)
        title = html('h1.main_title').children('a').text()
        for a in html('div.crumb-item').children('a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()    
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": i,
            "type": "list",
            "caption": "271视频全集"
        }
        parser_threads = []
        q_results = queue.Queue()
        parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
        parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
        for parser_thread in parser_threads:
            parser_thread.start()
        for parser_thread in parser_threads:
            parser_thread.join()
        while not q_results.empty():
            data["data"] =q_results.get()
            break
        if data["data"] == []:
            try:
                data["data"] = get_list_info_html(html)
            except Exception as e:
                #import traceback  
                #traceback.print_exc()  
                print(e)
            
        data["total"] = len(data["data"])
        
        return data
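The threading pattern above, running several candidate parsers in parallel and taking the first non-empty result from a shared queue, can be distilled into a small standalone sketch (the toy parsers are illustrative):

import queue
import threading

def race_parsers(parsers, text):
    # Run every candidate parser on its own thread; each one that
    # produces a non-empty result pushes it onto the shared queue.
    q = queue.Queue()

    def run(parser):
        try:
            result = parser(text)
            if result:
                q.put(result)
        except Exception:
            pass  # a failing parser simply contributes nothing

    threads = [threading.Thread(target=run, args=(p,)) for p in parsers]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Keep only the first result, as the example above does.
    return q.get() if not q.empty() else []

print(race_parsers([lambda t: 1 / 0, lambda t: [t.upper()]], "ep1"))  # ['EP1']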
Code example #36
    def parse(self, input_text, pool=pool_get_url, *k, **kk):
        # modified from sceext2's list271.py
        def get_list_info_api1(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
            # http://cache.video.qiyi.com/jp/avlist/202340701/2/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid, page_n):
                url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
                # print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # request each page
                page_n = 0
                urls = []
                while True:
                    # make request url
                    page_n += 1
                    url = make_port_url(aid, page_n)
                    # get text
                    raw_text = get_url(url, pool=pool)

                    # get list
                    sub_list = parse_one_page(raw_text)
                    for sub in sub_list:
                        url = sub['url']
                        if url in urls:
                            sub_list = []
                        else:
                            urls.append(url)
                    if len(sub_list) > 0:
                        vlist += sub_list
                    else:  # no more data
                        break
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # remove 'var tvInfoJs={' before json text, and json just ended with '}'
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)

                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []  # just return null result

                # get and parse video info items
                vlist = info['data']['vlist']
                out = []  # output info
                for v in vlist:
                    one = {}

                    one['no'] = v['pd']
                    one['title'] = v['vn']
                    one['subtitle'] = v['vt']
                    one['url'] = v['vurl']

                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['id']

                    out.append(one)
                # get video info done
                return out

            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist

        def get_list_info_api2(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
            # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid):
                url = URL_JS_API_PORT + str(aid) + '/'
                # print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # make request url
                url = make_port_url(aid)
                # get text
                raw_text = get_url(url, pool=pool)
                # get list
                vlist = parse_one_page(raw_text)
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # remove 'var tvInfoJs={' before json text, and json just ended with '}'
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)

                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []  # just return null result

                # get and parse video info items
                vlist = info['data']
                out = []  # output info
                for v in vlist:
                    one = {}

                    one['no'] = v['desc']
                    one['title'] = v['desc']
                    one['subtitle'] = v['shortTitle']
                    one['url'] = v['vUrl']

                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['tvId']

                    out.append(one)
                # get video info done
                return out

            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = i['no']
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist

        def get_list_info_html(html):
            # print("get_list_info_html")
            data = []
            album_items = html('ul.site-piclist').children('li')
            for album_item in album_items:
                album_item = PyQuery(album_item)
                site_piclist_info = PyQuery(
                    album_item.children('div.site-piclist_info'))
                site_piclist_info_title = PyQuery(
                    site_piclist_info.children('p.site-piclist_info_title'))
                site_piclist_info_title_a = PyQuery(
                    site_piclist_info_title.children('a'))
                site_piclist_info_title_fs12 = PyQuery(
                    site_piclist_info.children('p.fs12'))
                site_piclist_info_title_fs12_a = PyQuery(
                    site_piclist_info_title_fs12.children('a'))
                no = site_piclist_info_title_a.text()
                # if re.search("预告",no):
                # continue
                name = site_piclist_info_title_fs12_a.text()
                url = site_piclist_info_title_fs12_a.attr('href')
                if url is None:
                    continue
                subtitle = site_piclist_info_title_fs12_a.text()
                info = {
                    "name": name,
                    "no": no,
                    "subtitle": subtitle,
                    "url": url
                }
                data.append(info)
            return data

        # print("2"+input_text)
        def run(queue, get_list_info, html_text):
            try:
                result = get_list_info(html_text)
                if result != []:
                    queue.put(result)
            except Exception as e:
                # import traceback
                # traceback.print_exc()
                logging.error(str(get_list_info) + str(e))

        html_text = get_url(input_text, pool=pool)
        html = PyQuery(html_text)
        title = html('h1.main_title > a').text()
        if not title:
            for a in html('div.crumb-item > a'):
                a = PyQuery(a)
                if a.attr('href') in input_text:
                    title = a.text()
        if not title:
            try:
                title = match1(html_text, '<title>([^<]+)').split('-')[0]
            except AttributeError:
                pass
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": i,
            "type": "list",
            "caption": "271视频全集"
        }
        parser_threads = []
        q_results = queue.Queue()
        parser_threads.append(
            threading.Thread(target=run,
                             args=(q_results, get_list_info_api1, html_text)))
        parser_threads.append(
            threading.Thread(target=run,
                             args=(q_results, get_list_info_api2, html_text)))
        for parser_thread in parser_threads:
            parser_thread.start()
        for parser_thread in parser_threads:
            parser_thread.join()
        while not q_results.empty():
            data["data"] = q_results.get()
            break
        if not data["data"]:
            try:
                data["data"] = get_list_info_html(html)
            except Exception:
                # import traceback
                # traceback.print_exc()
                logging.exception(str(get_list_info_html))

        data["total"] = len(data["data"])
        return data
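Both versions rely on the fact that the cache.video.qiyi.com ports return a JavaScript assignment such as var tvInfoJs={...} rather than bare JSON; parse_one_page strips the prefix by keeping everything from the first '{' onward. A minimal sketch of that unwrapping step (the sample payload is illustrative):

import json

def strip_js_wrapper(raw_text):
    # Drop everything before the first '{' and re-attach the brace,
    # exactly as parse_one_page does in the examples above.
    return json.loads("{" + raw_text.split("{", 1)[1])

raw = 'var tvInfoJs={"code": "A00000", "data": {"vlist": []}}'
info = strip_js_wrapper(raw)
print(info["code"])  # A00000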