def get_playlist(self, url):
    """Collect (chapter title, chapter URL) pairs from a book index page.

    Example page: https://www.shuquge.com/txt/12236/index.html

    Returns an empty list when either the book heading or the chapter
    list cannot be found in the page.
    """
    page = self.get_hutf(url)
    heading = SelStr("div.book div.info h2", page)
    if not heading:
        return []
    echo(heading[0].text)

    listing = SelStr("div.listmain dl", page)
    if not listing:
        return []

    # Chapter hrefs are joined onto the directory part of the index URL.
    folder = os.path.dirname(url)
    chapters = []
    wanted = True
    for node in listing[0].children:
        if node.tag == 'dt':
            # A <dt> starts a new section; entries that follow a heading
            # containing "latest chapters" are skipped.
            wanted = u"最新章节" not in node.text
        elif wanted and node.tag == 'dd':
            link = os.path.join(folder, node.select("a")[0]['href'])
            echo(link, node.text)
            chapters.append((node.text, link))
    return chapters
def query_info(self, url):
    """Resolve the title and a direct media URL for an embed page.

    Example pages:
        https://www.rapidvideo.com/embed/FUZ35WDLM7
        https://www.rapidvideo.com/embed/ZsNSciBj

    Returns ``(title, kind, [media_url], None)``. ``kind`` is ``'mp4'``
    when the page title ends with ``.mp4`` (the suffix is stripped from
    the title) and ``None`` otherwise.
    """
    hutf = self.get_hutf(url)
    debug(hutf)
    title = SelStr('title', hutf)[0].text
    k = None
    if title.endswith('.mp4'):
        title, k = title[:-4], 'mp4'

    # Simple case: the page carries a <video> element with a direct source.
    v = SelStr("video#videojs source", hutf)
    if v:
        return title, k, [v[0]["src"]], None

    # Otherwise request the page again with block=1 and read the source
    # list out of the jwplayer setup call.
    hutf = self.get_hutf(url, postdata='block=1')
    data = match1(hutf, r'jwplayer\("home_video"\)\.setup\(([^\(\)]+)\);')
    debug(data)
    data = match1(data, r'"sources":\s*(\[[^\[\]]+\])')

    # Pick the source whose label sits latest in self.labels.  The rank
    # starts below 0 so that a label at index 0 can still be selected,
    # and an unknown label no longer raises ValueError from list.index():
    # it is reported and only used when no file has been seen yet.
    best_rank, u = -1, ''
    for src in json.loads(data):
        label = src['label']
        if label in self.labels:
            rank = self.labels.index(label)
        else:
            echo("new label", label)
            rank = -1
        if rank > best_rank or not u:
            best_rank, u = rank, src['file']
    debug(title, u)
    return title, k, [u], None
def test(self, args):
    """Post every 'daily' play-source link of ``args.url`` to a local service.

    Each matching link is sent as a form POST to http://127.0.0.1:8080/
    together with a numbered title, followed by a two second pause.
    Example page (from the original notes): http://qdrama.org/k2/
    """
    import time
    from urllib2 import urlopen
    from urllib import urlencode

    page = self.get_hutf(args.url)
    title = SelStr("div.title.sizing h1", page)[0].text
    links = SelStr("div#playsource a", page)
    for cnt, node in enumerate(links, 1):
        name = "%s_%02d" % (title.encode('utf8'), cnt)
        target = node['href']
        if 'daily' not in target:
            continue
        echo(name, target)
        # Manual skip switch: raise the bound to skip leading entries.
        if cnt < 0:
            continue
        form = urlencode({"aviurl": target,
                          "avitil": name,
                          "destdn": "../dwm/xman/",
                          "sub": "Start"})
        urlopen("http://127.0.0.1:8080/", form).read()
        time.sleep(2)
def handle_sp_list(self, url):
    """Handle a serial ("sp") play list page and then exit the process.

    Example: http://www.bilibili.com/sp/<name>.  The season pages live at
    http://www.bilibili.com/sppage/bangumi-<spid>-<season_id>-1.html
    (see loadBgmPage in base.special.js), so the numeric spid is read
    from the page first.

    Every collected episode is run through a fresh BILIBILI instance,
    honouring the playlist_skip / playlist_top values of the parsed
    arguments.  The method ends with ``sys.exit(1)``.
    """
    hutf = self.get_hutf(url)
    spid = search_first(hutf, 'var spid = "(\d+)";').group(1)
    echo("spid=", spid)

    urls = []
    for li in SelStr('ul#season_selector li', hutf):
        season_url = "http://www.bilibili.com/sppage/bangumi-%s-%s-1.html" % (
            spid, li['season_id'])
        data = self.get_hutf(season_url)
        urls.extend(
            (n['title'].strip(), 'http://www.bilibili.com' + n['href'])
            for n in SelStr('div.season_list li a.t', data))

    # Work on a copy so the per-episode runs do not re-apply the limits.
    args = copy(self.parsed_args)
    sk = args.playlist_skip
    # NOTE(review): -1315 looks like a sentinel; its meaning is not
    # visible in this block.
    args.playlist_skip = -1315
    tp = args.playlist_top
    args.playlist_top = 0

    for cnt, (t, u) in enumerate(urls, 1):
        if cnt > tp > 0:
            break
        if cnt < sk:
            continue
        echo(t, u)
        b = BILIBILI()
        b.title = t
        args.url = u
        run(b, args)
    sys.exit(1)
def get_playlist(self, url):
    """Return (title, absolute URL) pairs for each link in div#playlist1.

    Every title is the stripped text of the first "h2.title a" element,
    an underscore, and the stripped text of the link.
    """
    hutf = self.get_hutf(url)
    series = SelStr("h2.title a", hutf)[0].text.strip()
    episodes = []
    for link in SelStr('div#playlist1 a', hutf):
        name = series + "_" + link.text.strip()
        episodes.append((name, "https://www.duboku.co" + link['href']))
    return episodes
def get_playlist(self, url):
    """Build the episode list for a series page.

    The series title and episode count come from the h1.entry-title
    heading; the numeric page id ``p`` comes from the first content link
    whose query string has both ``p`` and ``page``.

    Returns a list of (title, URL) pairs, one per episode from 1 to the
    episode count.  Returns an empty list for dayi.ca URLs, when the
    heading matches neither known layout, or when no link carries the
    page id (the latter two previously failed on unbound local names).
    """
    if 'dayi.ca/' in url:
        return []
    hutf = self.get_hutf(url)

    t = SelStr("h1.entry-title", hutf)[0]
    # Two heading layouts are recognised: "... episode N" first, then
    # "<title> N episodes, complete".
    m = re.search(U("(.+) .+第(\\d+)集"), t.text)
    if not m:
        m = re.search(u"(.+) (\\d+)集全", t.text)
    if not m:
        return []
    title, max_id = m.group(1).strip(), int(m.group(2))

    pn = None
    for a in SelStr("div.entry-content p a", hutf):
        qs = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        if 'p' in qs and 'page' in qs:
            pn = int(qs['p'][0])
            break
    if pn is None:
        return []

    us = [(U("%s_第%02d集") % (title, i),
           "http://www.dayi.ca/ys/?p=%d&page=%d" % (pn, i))
          for i in range(1, max_id + 1)]
    debug(us)
    return us
def test(self, argv):
    """Scratch helper: fetch one hard-coded page and echo its HTML.

    Earlier sample URLs kept for reference:
        http://www.dayi.ca/ys/?p=3004&page=2
        http://www.dayi.ca/ys/?p=2386&page=52
        http://www.dayi.ca/ys/?p=3004&page=1
        http://www.dayi.ca/ys/?p=4076&&page=1
    """
    # The first assignment is immediately superseded by the second; both
    # are percent-encoded Chinese show titles (上阳赋, then 西京故事).
    url = 'http://tv8.fun/%e4%b8%8a%e9%98%b3%e8%b5%8b/'
    url = 'http://tv8.fun/%e8%a5%bf%e4%ba%ac%e6%95%85%e4%ba%8b/'
    page = self.get_hutf(url)
    echo(page)
    return

    # Everything below is unreachable because of the bare return above;
    # it is kept as parsing experiments for the same page layout.
    heading = SelStr("h1.entry-title", page)[0]
    found = re.search(u"(.+) 至第(\d+)集", heading.text)
    echo(found.group(1), found.group(2))
    paragraphs = SelStr("div.entry-content p", page)
    echo(paragraphs[3].text)
    found = re.search(u"通用版.+第(\d+)集", paragraphs[3].text)
    echo(found.group(1))
    found = re.search(U("首播:.+共(\d+)集"), paragraphs[0].text)
    echo(found.group(1))
    for link in paragraphs[1].select("a"):
        if 'page=' in link['href']:
            parts = urlparse.urlparse(link['href'])
            echo(urlparse.parse_qs(parts.query))
            break
def get_playlist(self, url):
    """Return (title, absolute URL) pairs for the links in the playlist box.

    The series title falls back to "Unknown" when the page has no
    "div.video_title h2.title" element.
    """
    hutf = self.get_hutf(url)
    heading = SelStr("div.video_title h2.title", hutf)
    title = heading[0].text if heading else "Unknown"

    entries = []
    for link in SelStr("div#playlistbox ul.content_playlist li a", hutf):
        label = u"%s_%s" % (title, link.text)
        entries.append((label, "https://www.olevod.com" + link['href']))
    return entries
def query_info(self, url):
    """Return the chapter heading and text nodes of a chapter page.

    Example page: https://www.shuquge.com/txt/12236/46252712.html

    Returns ``(heading text, "book", nodes matching div.showtxt, 1)``.
    """
    page = self.get_hutf(url)
    text_nodes = SelStr("div.showtxt", page)
    heading = SelStr("div.content h1", page)[0]
    return heading.text, "book", text_nodes, 1
def query_info(self, url):
    """Delegate to OpenLoad for the iframe embedded in the page.

    Example page: http://vmus.online/The-Outpost-S01EP01.html

    The page <title> text is handed to the OpenLoad instance as its
    title, then the first "div.entry-content>p iframe" src is queried.
    """
    page = self.get_hutf(url)
    loader = OpenLoad()
    loader.title = SelStr("title", page)[0].text
    frames = SelStr("div.entry-content>p iframe", page)
    return loader.query_info(frames[0]['src'])
def query_info(self, url):
    """Render the page with phantomjs and read the <video> source from it.

    Example page: http://8drama.com/122804/

    Returns ``(title, None, [video_src], None)`` where the title is the
    text of the h1.entry-title element.
    """
    echo('phantomjs wait ...')
    proc = Popen(["./phantomjs", "dwm.js", "300", url], stdout=PIPE)
    raw = proc.stdout.read()
    hutf = raw.decode('utf8')
    proc.wait()

    video_src = SelStr('video source', hutf)[0]['src']
    title = SelStr('h1.entry-title', hutf)[0].text
    return title, None, [video_src], None
def query_info(self, url):
    """Return the title and m3u8 reference found in the post entry.

    The ``video:`` value is read from the text of the first
    "div.post-entry p" element and passed through ``self.last_m3u8``.
    Returns ``(title, "m3u8", result of last_m3u8, None)``.
    """
    page = self.get_hutf(url)
    heading = SelStr("h3", page)[0]
    para = SelStr("div.post-entry p", page)[0]

    # Read the video reference before the paragraph is trimmed below.
    video = match1(para.text, ' video:\s*(\S+)').strip('"')
    video = self.last_m3u8(video)

    # Keep only DataNode children so that para.text, used for the title,
    # is built from those nodes alone.
    para.children = [kid for kid in para.children
                     if isinstance(kid, DataNode)]
    title = "%s_%s" % (heading.text.strip(), para.text.strip())
    return title, "m3u8", video, None
def query_info(self, url):
    """Resolve the title and media URL(s) behind a ttwanda page.

    Example page: http://www.ttwanda.com/films/us/1693.html?xf

    Returns ``(title, kind, urls, size)``.  Pages whose player points at
    ``/player/v.php?url=`` are delegated to LETV.  The process exits when
    the player page does not expose ``var play_url``.
    """
    page = self.get_hutf(url)
    if '?' not in url:
        # No query string yet: append the href of the first
        # "section.p5 div a" link and load that page instead.
        url = url + SelStr('section.p5 div a', page)[0]['href']
        page = self.get_hutf(url)

    title = SelStr("div.video-content article p strong", page)[0].text
    pattern = "《(.+)》"
    if not py3:
        pattern = pattern.decode('utf8')
    bracketed = match1(title, pattern)
    if bracketed and '/films/' in url:
        title = bracketed

    src = SelStr('iframe.player', page)[0]['src']
    if '/player/v.php?url=' in src:
        # e.g. http://www.ttwanda.com/tv/ustv/945.html embeds
        # ../../player/v.php?url=www.le.com/ptv/vplay/20723618.html
        src = 'http://' + src.split('?url=', 1)[1]
        from letv import LETV
        return LETV().query_info(src)

    if not src.startswith(("http://", "https://")):
        src = 'http://www.ttwanda.com/' + src
    echo(src)
    self.extra_headers['Referer'] = url  # this is important
    page = self.get_hutf(src)
    dst = match1(page, 'var play_url \= "([^"]+)"')
    echo(dst)
    if not dst:
        echo("Can not find var play_url")
        sys.exit(1)

    youku_m3u8 = 'youku.com/' in dst and '/m3u8' in dst
    if youku_m3u8 or 'lecloud.com/' in dst or '/letv-uts/' in dst:
        return title, None, self.try_m3u8(dst), None

    if 'ttwanda.com/ftn_handler/' in dst:
        # Forward every cookie except PHPSESSID to the size probe.
        cs = []
        for c in self.cookie.cookiejar:
            if c.name != 'PHPSESSID':
                cs.append("%s=%s" % (c.name, c.value))
        echo(cs)
        self.wget_cookie = "; ".join(cs)
        k, s = get_kind_size(dst, self.wget_cookie)
        return title, k, [dst], s

    return title, None, [dst], None
def query_info(self, url):
    """Gather the media URLs of every play-source link on the page.

    Each "div#playsource a" href is resolved through a single DM
    instance and the resulting URL lists are concatenated.
    Returns ``(title, None, urls, None)``.
    """
    page = self.get_hutf(url)
    title = SelStr("div.title.sizing h1", page)[0].text
    links = SelStr("div#playsource a", page)
    collected = []
    resolver = DM()
    for link in links:
        _title, _ext, part, _size = resolver.query_info(link['href'])
        echo(part)
        collected += part
    return title, None, collected, None
def test1(self, args):
    """Walk a mobile book chapter by chapter and echo the cleaned text.

    Starts at http://m.bookdown.com.cn/read/31314.html and follows the
    "next chapter" button until the page no longer has one.
    """
    next_button = (
        u'''<a class="btn" href="(http://m\.bookdown\.com\.cn/read/31314_.+\.html)">下一章</a>''')
    url = 'http://m.bookdown.com.cn/read/31314.html'
    while True:
        print >> sys.stderr, url
        hutf = self.get_hutf(url)
        for div in SelStr('div.articlecon', hutf):
            # NOTE(review): the first pattern below may originally have
            # been a non-breaking space -- confirm against the source file.
            spaced = re.sub(" ", " ", div.text)
            # Drop the "continue on the next page" notice from the text.
            echo(re.sub(u"分节阅读.+,请点击下一页继续阅读。", "", spaced))
        hits = re.findall(next_button, hutf)
        if not hits:
            break
        url = hits[0]
def query_info(self, url):
    """Return ``(title, ext, urls, totalsize)`` for a video page.

    Example pages:
        http://www.iqiyi.com/v_19rr26qr38.html
        https://www.iqiyi.com/v_19rr04z9is.html?list=19rrm106om
        https://www.iqiyi.com/v_19rr04z9is.html

    ``ext`` is "ts" when the first resolved URL contains ".ts?" and
    ``None`` otherwise; ``totalsize`` is always ``None``.
    """
    page = self.get_hutf(url)

    # Take the title from the first meta tag that exists; fall back to
    # self.title when neither selector yields an element.
    for selector in ('meta[name=irTitle]', 'meta[property=og:title]'):
        try:
            title = SelStr(selector, page)[0]["content"]
        except IndexError:
            title = self.title
        else:
            break

    tvid = match1(page, """param\['tvid'\] = "(\d+)";""")
    vid = match1(page, """param\['vid'\] = "([^"]+)";""")
    echo("tvid=", tvid, ", vid=", vid)

    info = I2().getVMS(tvid, vid)
    vd, url = self.get_vd_url(info)
    echo(title)

    page = self.get_hutf(url)
    streams = self._get_m3u8_urls(url, page)
    ext = "ts" if '.ts?' in streams[0] else None
    return title, ext, streams, None
def query_info(self, url):
    """Return ``(title, None, [src], None)`` for the page's video player.

    When the chrome-rendered page already has a "video#video_player"
    element its src is used directly; otherwise the lookup is handed to
    ``self.query_info_chrome`` with a handle from ``get_ci(DEBUG)``,
    which is always stopped afterwards.
    """
    page = self.chrome_hutf(url)
    title = SelStr("html head title", page)[0].text
    echo("title =", title)

    players = SelStr("video#video_player", page)
    echo(players)
    if players:
        return title, None, [players[0]["src"]], None

    ci = get_ci(DEBUG)
    try:
        result = self.query_info_chrome(ci, url)
    finally:
        print("ci.stop()")
        ci.stop()
    return result
def test(self, args):
    """Scratch check: fetch a fixed embed page and read its video source.

    The expected source for this page was noted as
    https://www3731.playercdn.net/187/0/G4i-UJ6bQxIZI6FWc_F5dg/1536365722/180905/692FUZ37O792IXDCUZDFX.mp4
    Nothing is returned or printed; a missing element raises.
    """
    page = self.get_hutf("https://www.rapidvideo.com/embed/FUZ35WDLM7")
    sources = SelStr("video#videojs source", page)
    u = sources[0]["src"]
def get_playlist(self, url):
    """Return (text, href) pairs for the item links in div.tvlists.

    Links that contain a "span.sn_ispreview" element are left out.
    """
    page = self.get_hutf(url)
    return [(link.text, link['href'])
            for link in SelStr('div.tvlists div.item a', page)
            if not link.select("span.sn_ispreview")]
def get_playlist1(self, url):
    """Return the links of the first content paragraph marked "M3U".

    Example page: http://tv8.fun/20170328-<show name>/

    The series title is the alt text of the first content image, or the
    page <title> when there is no such image.  Returns a list of
    (series title + "_" + link text, href) pairs, or an empty list when
    no paragraph has a <strong> whose text contains "M3U".
    """
    page = self.get_hutf(url)
    images = SelStr("div.entry-content p img", page)
    if images:
        title = images[0]['alt']
    else:
        title = SelStr("title", page)[0].text
    echo(title)

    for para in SelStr("div.entry-content p", page):
        strong = para.select("strong")
        if not strong or "M3U" not in strong[0].text:
            continue
        found = [(title + '_' + link.text, link['href'])
                 for link in para.select("a")]
        echo(found)
        return found
    return []
def query_info(self, url):
    """Return the title and m3u8 reference from the page's player_data.

    The ``url`` field of the ``var player_data = {...}`` JSON object is
    passed through ``self.last_m3u8``.  The title is the h2.title text
    with every run of whitespace replaced by an underscore.
    Returns ``(title, "m3u8", result of last_m3u8, None)``.
    """
    page = self.get_hutf(url)
    raw = match1(page, r"var\s+player_data\s*\=\s*({[^}]+})")
    debug(raw)
    player = json.loads(raw)
    stream = self.last_m3u8(player['url'])
    heading = SelStr("h2.title", page)[0]
    return '_'.join(heading.text.split()), "m3u8", stream, None
def test1(self, argv):
    """Scratch check: echo (0, href) for every a.fasc-button on a fixed page."""
    page = self.get_hutf('http://vmus.online/the-outpost-s01.html')
    buttons = SelStr("a.fasc-button", page)
    pairs = []
    for button in buttons:
        pairs.append((0, button['href']))
    echo(pairs)
def query_info1(self, url):
    """Return the title and m3u8 URL written in the page's first paragraph.

    Example page: http://www.dayi.ca/ys/?p=2386&page=52

    The title is the first <h3> text plus the first whitespace-separated
    word of the paragraph.  The URL is the ``video:`` value, with one
    pair of surrounding quotes removed when present.
    Returns ``(title, "m3u8", url, None)``.
    """
    page = self.get_hutf(url)
    content = SelStr("div#content-outer div#content", page)[0]
    heading = content.select('h3')[0].text
    para = content.select('p')[0]
    title = heading + '_' + para.text.split()[0]
    echo(title)

    video = match1(para.text, 'video:(\S+)')
    quote = video[0]
    if quote in ("'", '"'):
        # Keep only what sits between the opening quote and the next one.
        video = video.split(quote)[1]
    echo(video)
    return title, "m3u8", video, None
def get_playlist(self, page_url):
    """Return (image title, href) pairs from the active episode list.

    Example pages:
        http://www.letv.com/tv/10003313.html
        http://www.le.com/tv/10009472.html

    Links whose first <img> has no 'title' are skipped.
    """
    page = self.get_hutf(page_url)
    found = []
    for link in SelStr('div.list.active > dl > dt > a', page):
        image = link.select("img")[0]
        if 'title' not in image:
            continue
        found.append((image['title'], link['href']))
    return found
def test(self, args):
    """Walk a book page by page and echo the cleaned text.

    The book id is taken from a fixed bookinfo URL (a /read/ URL such as
    http://www.bookdown.com.cn/read/30258_1.html would match as well);
    reading starts at page 1 and follows a#nextPage until it is absent.
    """
    start = "http://www.bookdown.com.cn/bookinfo/30258.html"
    ids = match1(start, "/bookinfo/(\d+)\.html", "/read/(\d+).*\.html")
    bid = int(ids[0])
    echo("bid =", bid)

    url = "http://www.bookdown.com.cn/read/%d_1.html" % bid
    while True:
        hutf = self.get_hutf(url)
        for div in SelStr('div#view_content_txt', hutf):
            # NOTE(review): the first pattern below may originally have
            # been a non-breaking space -- confirm against the source file.
            spaced = re.sub(" ", " ", div.text)
            # Drop the "continue on the next page" notice from the text.
            echo(re.sub(u"分节阅读.+,请点击下一页继续阅读。", "", spaced))
        following = SelStr("a#nextPage", hutf)
        if not following:
            break
        url = following[0]['href']
def get_playlist(self, page_url):
    """Return (title, href) pairs for a playlist or album page.

    Playlist pages (e.g. http://www.iqiyi.com/playlist521743802.html)
    are fetched directly.  Other pages (e.g.
    http://www.iqiyi.com/a_19rrhb9eet.html) are rendered with phantomjs
    first; for those, ``self.align_num`` is set to the number of digits
    in the entry count.
    """
    if '/playlist' in page_url:
        page = self.get_hutf(page_url)
        entries = []
        for el in SelStr("div.site-piclist_pic > a.site-piclist_pic_link",
                         page):
            entries.append((el['title'], el['href']))
        return entries

    echo("get_list phantomjs wait 200 ...")
    proc = Popen(["./phantomjs", "dwm.js", "200", page_url], stdout=PIPE)
    raw = proc.stdout.read()
    proc.wait()
    page = raw.decode("utf8")

    entries = []
    for link in SelStr('div.smalList > ul > li > a', page):
        entries.append((link.text, link['href']))
    self.align_num = len(str(len(entries)))
    return entries
def query_info1(self, url):
    """Resolve the download URL for a player page.

    Example page:
        https://www.dnvod.eu/Movie/Readyplay.aspx?id=deYM01Pf0bo%3d

    The ``id`` and ``key`` values are read from the first <script> whose
    text contains ``PlayerConfig``; the key is then posted to the
    GetResource endpoint with the page as Referer.

    Returns ``(title, None, [response body], None)``.

    Raises:
        ValueError: when no script yields both ``id`` and ``key``
            (previously this surfaced as an unbound-name or TypeError
            further down).
    """
    hutf = self.get_hutf(url)
    title = SelStr('span#bfy_title >', hutf)[0].data.strip()
    debug('title =', title)

    vid = key = None
    for script in SelStr('script', hutf):
        txt = script.text
        debug('txt =', txt)
        if 'PlayerConfig' not in txt:
            continue
        debug('got PlayerConfig')
        vid = match1(txt, r"id:\s*'([^']+)',")
        key = match1(txt, r"key:\s*'([^']+)',")
        debug('vid =', vid, ', key =', key)
        break
    if not vid or not key:
        raise ValueError("PlayerConfig id/key not found in %s" % url)

    u = "https://www.dnvod.eu/Movie/GetResource.ashx?id=%s&type=htm" % vid
    self.extra_headers['Referer'] = url
    durl = self.get_html(u, postdata="key=" + key)
    debug(durl)
    return title, None, [durl], None
def download_one(self, url):
    """Save the text of a document page to ``<title>.txt``.

    Example page: https://shimo.im/docs/gJQufddR72AZJcna/read

    The file name is the ``data-value`` of the title box plus ".txt",
    opened relative to the current working directory.  A newline is
    appended to every <p> of the editor body before its text is written.
    """
    hutf = self.get_hutf(url)
    d = SelStr("div#editor", hutf)[0]
    title_box = d.select("div.ql-title div.ql-title-box")[0]
    # NOTE(review): the file name comes straight from page content.
    title = title_box["data-value"] + ".txt"

    body = d.select("div.ql-editor")[0]
    for p in body.select("p"):
        # End each paragraph with a newline: extend a trailing DataNode
        # when there is one, otherwise add a new DataNode.
        if p.children and isinstance(p.children[-1], DataNode):
            p.children[-1].append("\n")
        else:
            p.children.append(DataNode(p, "\n"))

    # The context manager closes the file even when write() raises.
    with open(title, "w") as fout:
        fout.write(body.text)
def query_info1(self, url):
    """Resolve the stream URL of an openload embed page via phantomjs.

    Example pages:
        https://openload.io/embed/igdtpdeGltM/
        https://openload.co/embed/isCWWnlsZLE/
        https://openload.io/embed/biw7ytfelzU/
    The rendered page is expected to contain something like
        <span id="streamurl">isCWWnlsZLE~1481138074~208.91.0.0~g617lYdo</span>

    Returns ``(title, kind, [stream_url], size)``.  Returns
    ``(self.title, None, [], None)`` when the page shows an <h6>
    message, when the embed id cannot be read from ``url``, or when the
    rendered page has no ``streamurl`` span (the last two previously
    raised on an unbound name / ``None``).
    """
    echo("phantomjs wait 300 ...")
    p = Popen(["./phantomjs", "dwm.js", "300", url], stdout=PIPE)
    html = p.stdout.read()
    hutf = html.decode('utf8')
    p.wait()
    debug(hutf)

    n = SelStr('h6', hutf)
    if n:
        echo(n[0].text)
        return self.title, None, [], None

    m = re.search('''openload.co/embed/([^/]+)/''', url)
    if not m:
        echo("Can not find openload embed id in", url)
        return self.title, None, [], None
    uid = m.groups()[0]
    echo(uid)

    m = re.search('''<span id="streamurl">([^<>]+)</span>''', hutf)
    if not m:
        echo("Can not find streamurl")
        return self.title, None, [], None
    vid = m.groups()[0]
    if not vid.startswith(uid):
        # TODO, try to decode it
        vid = uid + "~1497803146~64.180.0.0~eBodZDZa"
    echo(vid)
    # e.g. https://openload.co/stream/isCWWnlsZLE~1481139117~208.91.0.0~mcLfSy5C?mime=true
    url = "https://openload.co/stream/%s?mime=true" % vid

    n = SelStr("meta[name=og:title]", hutf)
    if n and self.title == UTITLE:
        self.title = n[0]['content']

    # e.g. kind "video/mp4", size 584989307
    k, tsize = get_kind_size(url)
    k = k.split('/')[-1]
    suffix = '.' + k
    if self.title.endswith(suffix):
        # Strip exactly the extension, whatever its length (the fixed
        # [:-4] slice was only right for three-letter kinds).
        self.title = self.title[:-len(suffix)]
    return self.title, k, [url], tsize
def query_info(self, url):
    """Resolve the title and media URL(s) behind a ttwanda page.

    Example page: http://www.ttwanda.com/films/us/1693.html?xf

    Returns ``(title, kind, urls, size)``.  Pages whose player points at
    ``/player/v.php?url=`` are delegated to LETV.  The process exits when
    the player page does not expose ``var play_url``.
    """
    page = self.get_hutf(url)
    if '?' not in url:
        # No query string yet: append the href of the first
        # "section.p5 div a" link and load that page instead.
        url = url + SelStr('section.p5 div a', page)[0]['href']
        page = self.get_hutf(url)

    title = SelStr("div.video-content article p strong", page)[0].text
    pattern = "《(.+)》"
    if not py3:
        pattern = pattern.decode('utf8')
    bracketed = match1(title, pattern)
    if bracketed and '/films/' in url:
        title = bracketed

    src = SelStr('iframe.player', page)[0]['src']
    if '/player/v.php?url=' in src:
        # e.g. http://www.ttwanda.com/tv/ustv/945.html embeds
        # ../../player/v.php?url=www.le.com/ptv/vplay/20723618.html
        src = 'http://' + src.split('?url=', 1)[1]
        from letv import LETV
        return LETV().query_info(src)

    if not src.startswith(("http://", "https://")):
        src = 'http://www.ttwanda.com/' + src
    echo(src)
    self.extra_headers['Referer'] = url  # this is important
    page = self.get_hutf(src)
    dst = match1(page, 'var play_url \= "([^"]+)"')
    echo(dst)
    if not dst:
        echo("Can not find var play_url")
        sys.exit(1)

    youku_m3u8 = 'youku.com/' in dst and '/m3u8' in dst
    if youku_m3u8 or 'lecloud.com/' in dst or '/letv-uts/' in dst:
        return title, None, self.try_m3u8(dst), None

    if 'ttwanda.com/ftn_handler/' in dst:
        # Forward every cookie except PHPSESSID to the size probe.
        cs = []
        for c in self.cookie.cookiejar:
            if c.name != 'PHPSESSID':
                cs.append("%s=%s" % (c.name, c.value))
        echo(cs)
        self.wget_cookie = "; ".join(cs)
        k, s = get_kind_size(dst, self.wget_cookie)
        return title, k, [dst], s

    return title, None, [dst], None
def get_playlist(self, url):
    """Return (text, URL) pairs for the paging links of a /tv/ page.

    URLs without '/tv/' yield an empty list.  The query string of
    ``url`` is dropped first.  A link href that carries ``vid=<digits>``
    (for example ``?vid=20723618&title=...``) is reduced to
    ``?vid=<digits>``; any other href is appended unchanged.
    """
    if '/tv/' not in url:
        return []
    base = url.split('?')[0]
    page = self.get_hutf(base)

    entries = []
    for link in SelStr('div.article-paging a', page):
        vid = match1(link['href'], 'vid=(\d+)')
        suffix = '?vid=' + vid if vid else link['href']
        entries.append((link.text, base + suffix))
    return entries