def getSource(url):
    """Resolve a Xuite vlog page to a local .m3u8 playlist path.

    Play-URLs are rewritten to the mobile site first; when an access key
    is available it is POSTed, otherwise the page is fetched directly.
    Returns the playlist path, or '' when no stream URL is found.
    """
    if re.search(r'^http://vlog.xuite.net/play/', url):
        txt = xurl.load(url)
        m = re.search(r'http://m.xuite.net/vlog/([^"]*)', txt)
        if m:
            url = m.group(0)
    os.chdir(xdef.workdir)
    key = getKey(url)
    if key:
        txt = xurl.post(url, {'pwInput': key})
    else:
        txt = xurl.load(url)
    m = re.search(r'data-original="([^"]*)"', txt)
    if m:
        print('\n[xuite][src]\n\n\t%s' % (m.group(1)))
        src_sd = m.group(1)
        # the HD variant is the same URL with the quality query bumped to 720
        src_hd = re.sub('q=360', 'q=720', m.group(1))
        m3u = 'xuite.m3u'
        # context manager guarantees the playlist is flushed and closed
        # (the original left the handle to be closed manually)
        with open(m3u, 'w') as fd:
            fd.write(src_hd + '\n')
            fd.write(src_sd + '\n')
        return '%s%s' % (xdef.workdir, m3u)
    return ''
def loadM3U8(url):
    """Extract a playable list from an iqiyi-style page.

    Prefers an embedded "m3u8" JSON value; otherwise collects f4v part
    URLs (resolving relative parts through data.video.iqiyi.com).
    Returns the path of a locally saved playlist, or None.
    """
    txt = xurl.load(url)
    # the page embeds JSON with escaped slashes and newlines; undo both
    # (the original wrote '\/' and relied on Python keeping the invalid
    # escape verbatim — '\\/' is explicit and byte-identical)
    txt = txt.replace('\\/', '/')
    txt = txt.replace('\\n', '\n')
    m = re.search(r'"m3u8":"([^"]*)"', txt)
    if m:
        m3u8 = m.group(1)
        local = xurl.genLocal(url, prefix='vod_list_', suffix='.m3u8')
        xurl.saveLocal(local, m3u8)
        return local
    results = []
    for l in re.finditer(r'"l":"([^"]*)"', txt):
        part = l.group(1)
        if re.search(r'f4v\?', part):
            if part.startswith('http'):
                results.append(part)
            else:
                # relative part: ask the data server for the real locations
                data_url = 'https://data.video.iqiyi.com/videos' + part
                for v in re.finditer(r'"l":"([^"]*)"', xurl.load(data_url)):
                    results.append(v.group(1))
    if results:
        local = xurl.genLocal(url, prefix='vod_list_', suffix='.m3u8')
        xurl.saveM3U8(local, results)
        return local
    return None
def extract(url):
    """Build entry/page objects for a pianku.tv detail or index URL."""
    objs = []
    basename = url.split('/')[-1]
    if len(basename) == 15:
        # detail page: fetch the episode list via the AJAX endpoint,
        # replaying the session cookie captured from the page itself
        url_tv = 'https://www.pianku.tv/ajax/downurl/%s_tv/' % (basename[0:10])
        local_cookie = xurl.genLocal(url, suffix='.cookie')
        xurl.load(url, opts=['-c %s' % (local_cookie)])
        opts = [
            '-b %s' % (local_cookie),
            '-H \'x-requested-with: XMLHttpRequest\'',
            '-H \'referer: %s\'' % (url),
        ]
        txt = xurl.load(url_tv, opts=opts)
        for m in re.finditer(r'<li><a href="([^"]*)">(.*?)</a></li>', txt):
            objs.append(entryObj(urljoin(url, m.group(1)), m.group(2)))
    else:
        # index page: scrape links, titles and lazy-loaded thumbnails
        pat = r'<a href="(.*?)" title="(.*?)" target="_blank"><img src=".*?"\s+data-funlazy="(.*?)"'
        for m in re.finditer(pat, load(url)):
            link = urljoin(url, m.group(1))
            title = m.group(2)
            img = urljoin(url, m.group(3))
            objs.append(pageObj(link, title, img))
    return objs
def extract_youtube_channels(url):
    """Collect channel entries from a YouTube channels page, following
    AJAX continuations when the page exposes them.

    Fix: the original dereferenced m1/m2 without checking that the
    INNERTUBE tokens matched, crashing with AttributeError whenever a
    continuation existed but the tokens did not.
    """
    objs = []
    datas = [parseYoutubeInitialDataJSON(url)]
    txt = xurl.load(url)
    m1 = re.search(r'"INNERTUBE_CONTEXT_CLIENT_VERSION":"([^"]*)"', txt)
    m2 = re.search(r'"INNERTUBE_CONTEXT_CLIENT_NAME":(\w+)', txt)
    for m in re.finditer(r'"continuation":"([^"]*)"', txt):
        # continuation requests need the client name/version headers;
        # without those tokens we cannot issue them at all
        if not (m1 and m2):
            break
        cont_url = 'https://www.youtube.com/browse_ajax?continuation=' + m.group(1)
        opts = []
        opts.append('-H \'x-youtube-client-version: %s\'' % (m1.group(1)))
        opts.append('-H \'x-youtube-client-name: %s\'' % (m2.group(1)))
        cont_txt = xurl.load(cont_url, opts=opts, ref=url)
        datas.append(json.loads(cont_txt))
    for data in datas:
        for x in findItem(data, ['gridChannelRenderer']):
            try:
                channelId = x['channelId'].encode('utf8')
                link = 'https://www.youtube.com/channel/' + channelId
                title = x['title']['simpleText'].encode('utf8')
                image = x['thumbnail']['thumbnails'][0]['url'].encode('utf8')
                objs.append(entryObj(link, title, image, 'Channel', False))
            except Exception:
                # malformed renderer entries are logged and skipped
                # (narrowed from a bare except)
                log('Exception:\n' + str(x))
    return objs
def getSource(url, fmt, ref):
    """Download the HLS manifest referenced by a youtube-dl JSON dump.

    Returns the local .m3u8 path, or None when no manifest_url is found.
    """
    local_json = youtubedl.extractURL(url, dontParseJson=True)
    # fix: the original pattern '.json' had an unescaped '.' and no anchor,
    # so any character followed by 'json' anywhere in the path would match
    local_m3u8 = re.sub(r'\.json$', '.m3u8', local_json)
    m = re.search(r'"manifest_url": "([^"]*)"', xurl.readLocal(local_json))
    if m:
        manifest_url = m.group(1)
        xurl.load(manifest_url, local=local_m3u8)
        return local_m3u8
    return None
def getSource(url, fmt, ref):
    """Resolve a vod-play page to its m3u8 (or direct) stream URL.

    Returns None for non vod-play URLs or on any extraction failure.
    """
    if not re.search(r'vod-play-id', url):
        return None
    try:
        txt = xurl.load(url)
        m = re.search(r'"url":"([^"]*)"', txt)
        url_m3u8 = m.group(1).replace('\\', '')
        txt_m3u8 = xurl.load(url_m3u8)
        m = re.search(r'(.*?\.m3u8)\s*', txt_m3u8)
        if m:
            # the playlist points at a nested (possibly relative) m3u8
            return xurl.urljoin(url_m3u8, m.group(1))
        return url_m3u8
    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        print('Exception')
    return None
def index(req):
    """mod_python entry point: dispatch on whichever query parameter is set.

    Branch order matters: json load ('j') wins over page ('p'), which wins
    over search ('q'), which wins over dir listing ('d').
    """
    req.content_type = 'text/html; charset=utf-8'
    form = req.form or util.FieldStorage(req)
    p = form.get('p', None)  # page
    q = form.get('q', None)  # query
    d = form.get('d', None)  # dir
    s = form.get('s', None)  # search
    x = form.get('x', None)  # extra
    j = form.get('j', None)  # json
    if j:
        xurl.init(logfile='vod-load-json.log')
        # an unparsed URL in the request overrides the form value
        j = getUnparsedURL(req) or j
        req.write(xurl.load(j))
    elif p:
        xurl.init(logfile='vod-page.log')
        p = getUnparsedURL(req) or p
        req.write(page.getPageJSON(p))
    elif q:
        xurl.init(logfile='vod-page-search.log')
        req.write(page.getSearchJSON(q, s, x))
    elif d:
        req.write(page.getDIRJSON(d))
    return
def getSource(dataLink):
    """Resolve a wtutor player id to its first listed video file URL.

    Returns None when the AJAX response contains no file/label pairs.
    """
    url = 'http://play.wtutor.net/wp-admin/admin-ajax.php?action=ts-ajax&p=%s&n=1' % dataLink
    txt = xurl.load(url)
    # fix: raw string — the original non-raw literal relied on Python keeping
    # the invalid escapes \s, \d, \} verbatim; the pattern itself is unchanged
    videos = re.findall(r'file\s*:\s*["\']([^"\']+).+?label\s*:\s*["\'](\d+)p[^}]', txt)
    if videos:
        return videos[0][0]
    return None
def get_tracks(no, bno, args):
    """Scrape broker trade rows for stock *no* / broker *bno* from histock.

    Rows are returned oldest-first. When no rows parsed and the page
    contains an alert (e.g. invalid query), the cached copy is removed
    so the next run refetches.
    """
    url = 'https://histock.tw/stock/brokertrace.aspx?bno={b}&no={n}'.format(b=bno, n=no)
    url_opts = []
    if args.cookies:
        url_opts.append('-H \'cookie: ' + args.cookies + '\'')
    local = xurl.genLocal(url, prefix='twstock_load_broker_')
    txt = xurl.load(url, local=local, opts=url_opts, cache=args.cache,
                    cacheOnly=args.cacheOnly, verbose=args.verbose)
    pat = r'<td>(.*?)</td><td>([\d|,]+)</td><td>(\d+[.]\d*)</td><td>([\d|,]+)</td><td>(\d+[.]\d*)</td><td>(\d+[.]\d*)</td>'
    vec = []
    for m in re.finditer(pat, txt):
        # prepend so the final list runs in chronological order
        vec.insert(0, track(m.group(1), m.group(2), m.group(3),
                            m.group(4), m.group(5), m.group(6)))
    if len(vec) == 0 and re.search('alert', txt):
        os.remove(local)
    return vec
def get_exchange_rate_infos(data):
    """Fetch the Bank of Taiwan daily rate CSV and build exchange_rate_info
    objects for every currency configured in data['ExchangeRates'].

    Each configured filter expression in exr['flts'] is evaluated against
    the info's attribute of the same name; results land in info.flts_ret.
    """
    if 'ExchangeRates' not in data:
        return []
    url = 'https://rate.bot.com.tw/xrt/flcsv/0/day'
    txt = xurl.load(url, cache=False)
    infos = []
    for exr in data['ExchangeRates']:
        c = exr['currency']
        m = re.search(re.escape(c) + r',本行買入,([^,]*),([^,]*),.*?本行賣出,([^,]*),([^,]*),', txt)
        if m:
            info = exchange_rate_info(c, m.group(1), m.group(2), m.group(3), m.group(4))
            info.flts = exr['flts']
            info.flts_ret = [0] * len(info.flts)
            infos.append(info)
    # check the returned value of each configured filter expression
    for info in infos:
        for i, f in enumerate(info.flts):
            try:
                vname = re.search(r'(\w+)', f).group(1)
                val = getattr(info, vname)
                cmd = f.replace(vname, val)
                # NOTE(review): eval of config-supplied expressions — safe only
                # while the bookmark/config source is trusted; consider a safe
                # expression parser if that ever changes
                info.flts_ret[i] = eval(cmd)
            except Exception:
                # a malformed expression leaves the default 0 in flts_ret
                # (narrowed from a bare except)
                pass
    return infos
def update_stock_report_overall(obj):
    """Populate closing price, PER, NAV, yearly PER ranges and capital
    stock on *obj* from the Fubon overview page for obj.code."""
    url = 'https://fubon-ebrokerdj.fbs.com.tw/z/zc/zca/zca_%s.djhtm' % (obj.code)
    txt = xurl.load(url, encoding='big5')

    def _num(s):
        # numbers on the page carry thousands separators
        return float(s.replace(',', ''))

    m = re.search(r'>收盤價</td>\s*<td class="t3n1">(.*)</td>', txt)
    if m:
        obj.pz_close = _num(m.group(1))
    m = re.search(r'>本益比</td>\s*<td class="t3n1">(.*)</td>', txt)
    if m and m.group(1) != 'N/A':
        obj.per = _num(m.group(1))
    m = re.search(r'>每股淨值\(元\)</td>\s*<td class="t3n1"><span class="t3n1">(.*?)</span></td>', txt)
    if m:
        obj.nav = _num(m.group(1))
    m = re.search(r'>年度</td>(.*?)</tr>', txt, re.MULTILINE | re.DOTALL)
    if m:
        obj.per_year = [int(x.replace(',', ''))
                        for x in re.findall(r'>([^<]+)</td>', m.group(1))]
    m = re.search(r'>最高本益比</td>(.*?)</tr>', txt, re.MULTILINE | re.DOTALL)
    if m:
        # 'N/A' cells become 0 so the yearly lists stay aligned
        obj.per_max = [_num(x) if x != 'N/A' else 0
                       for x in re.findall(r'>([^<]+)</td>', m.group(1))]
    m = re.search(r'>最低本益比</td>(.*?)</tr>', txt, re.MULTILINE | re.DOTALL)
    if m:
        obj.per_min = [_num(x) if x != 'N/A' else 0
                       for x in re.findall(r'>([^<]+)</td>', m.group(1))]
    m = re.search(r'>股本\(億, 台幣\)</td>\s*<td class="t3n1">(.*)</td>', txt)
    if m:
        obj.capital_stock = _num(m.group(1))
    return
def autotest():
    """Walk the bookmark JSON and try extracting every link, printing a
    pass/fail summary at the end."""
    list_pass = []
    list_fail = []
    bookmarkJSONURL = 'https://gist.githubusercontent.com/JiasHuang/30f6cc0f78ee246c1e28bd537764d6c4/raw/bookmark.json'
    data = json.loads(xurl.load(bookmarkJSONURL))
    for d in data['channels']:
        channel = d['channel'].encode('utf8')
        for item in d['links']:
            title = item['title'].encode('utf8')
            link = item['link'].encode('utf8')
            test = '[%s][%s] %s' % (channel, title, link)
            m = re.search(r'view.py\?(.*?)$', link)
            if m:
                def _param(name):
                    # pull a single query parameter out of the view.py URL
                    pm = re.search(name + r'=([^&]*)', m.group(1))
                    return pm.group(1) if pm else None
                q, s, x, p = _param('q'), _param('s'), _param('x'), _param('p')
                if q:
                    entryCnt = len(extractors.search(q, s, x))
                if p:
                    entryCnt = len(extractors.extract(p))
            else:
                entryCnt = len(extractors.extract(link))
            if entryCnt <= 0:
                list_fail.append(test)
            else:
                list_pass.append(test)
    print('\n--- pass ---\n')
    print('\n'.join(list_pass))
    print('\n--- fail ---\n')
    print('\n'.join(list_fail))
def getSource(url, fmt, ref):
    """Return the first <source> URL that is not an audio-only m3u8,
    falling back to the page URL itself."""
    txt = xurl.load(url)
    for m in re.finditer(r'source src="([^"]*)"', txt):
        src = m.group(1)
        if src.endswith('.m4a.m3u8'):
            continue  # skip audio-only renditions
        print('[src] %s' % (src))
        return src
    return url
def extract(url):
    """Extract playable objects from a LINE TODAY live page or API URL."""
    if re.search(r'api.today.line.me', url):
        # API response: pick the 720p stream directly
        link = re.search(r'"720":"([^"]*)"', xurl.load(url, cache=False))
        if link:
            return [entryObj(link.group(1))]
        return None
    programId = re.search(r'data-programId="([^"]*)"', xurl.load(url, cache=False))
    if programId:
        # hand back the API endpoint as a page to be extracted next
        link = 'https://api.today.line.me/webapi/linelive/' + programId.group(1)
        return [pageObj(link)]
    # fall back to scraping css background-image style links
    return [
        obj.to_page()
        for obj in findImageLink(url, ImageExt=None, ImagePattern=r'url\((.*?)\)')
    ]
def update_stock_report_dividend(obj):
    """Append up to 5 recent dividend rows to obj.dividend from the
    Yuanta dividend page for obj.code."""
    url = 'https://jdata.yuanta.com.tw/z/zc/zcc/zcc_%s.djhtm' % (obj.code)
    txt = xurl.load(url, encoding='big5')
    # row cells: year, cash(earnings), cash(reserves), cash total,
    # stock(earnings), stock(reserves), stock total, ...
    for m in re.finditer(r'<td class="t3n0">(.*?)</tr>', txt, re.MULTILINE | re.DOTALL):
        cells = re.findall(r'>([^<]+)</td>', m.group(0))
        if len(cells) == 9:
            obj.dividend.append(dividend_info(Y=cells[0],
                                              cash_a=cells[1], cash_b=cells[2],
                                              stock_a=cells[4], stock_b=cells[5]))
        if len(obj.dividend) >= 5:
            break
    return
def update_stock_report_news(obj):
    """Collect EPS / earnings-call related headlines for obj.code from
    the first two Yuanta news pages."""
    base = 'https://jdata.yuanta.com.tw'
    for page_no in range(1, 3):
        url = '%s/Z/ZC/ZCV/ZCV_%s_E_%d.djhtm' % (base, obj.code, page_no)
        txt = xurl.load(url, encoding='big5')
        pat = r'<tr><td class="t3t1">([^<]*)</td>\s*<td class="t3t1"><a href="([^"]*)">([^<]*)</a>'
        for m in re.finditer(pat, txt):
            date, href, title = m.group(1), m.group(2), m.group(3)
            # keep only EPS / earnings-call related headlines
            if re.search(r'(每股稅後|每股盈餘|EPS|法說)', title):
                obj.news.append(news_info(date, title, base + href))
    return
def get_stat_vol(code, cacheOnly):
    """Return {'30d_vol': average-daily-volume} scraped from the Yuanta
    volume chart data for *code*, or {} when the chart call is absent."""
    obj = {}
    url = 'https://jdata.yuanta.com.tw/z/zc/zcw/zcwg_%s.djhtm' % (code)
    # cache for 5 days (432000 s) — the 30-day stat changes slowly
    txt = xurl.load(url, cacheOnly=cacheOnly, expiration=432000, encoding='big5')
    m = re.search(r'GetBcdData\(\'([^ ]*) ([^\']*)\'', txt)
    if m:
        # idiom fix: sum() replaces the manual index-and-accumulate loop
        total_v = sum(int(v) for v in m.group(2).split(','))
        obj['30d_vol'] = total_v / 30
    return obj
def update_stock_report_revenue(obj):
    """Prepend monthly revenue rows to obj.revenue (page is newest-first,
    inserts at 0 restore chronological order), stopping at
    defs.from_year_offset ROC years back."""
    now = datetime.datetime.now()
    from_year = int(now.year) - 1911 - defs.from_year_offset  # ROC calendar
    url = 'https://jdata.yuanta.com.tw/z/zc/zch/zch_%s.djhtm' % (obj.code)
    txt = xurl.load(url, encoding='big5')
    for m in re.finditer(r'<td class="t3n0">(\d+)/(\d+)</td>(.*?)</tr>', txt,
                         re.MULTILINE | re.DOTALL):
        Y, M = m.group(1), m.group(2)
        if int(Y) < from_year:
            break
        cells = re.findall(r'>([^<]+)</td>', m.group(3))
        if cells:
            obj.revenue.insert(0, revenue_info(Y, M, cells[0].replace(',', '')))
    return
def update_stock_report_eps(obj):
    """Prepend quarterly income-statement rows to obj.eps, stopping at
    defs.from_year_offset ROC years back."""
    now = datetime.datetime.now()
    from_year = int(now.year) - 1911 - defs.from_year_offset  # ROC calendar
    url = 'https://fubon-ebrokerdj.fbs.com.tw/z/zc/zce/zce_%s.djhtm' % (obj.code)
    txt = xurl.load(url, encoding='big5')
    # cells: 0 revenue, 1 cost, 2 gross, 3 gross%, 4 op income, 5 op%,
    #        6 non-op, 7 pre-tax, 8 net income, 9 EPS
    for m in re.finditer(r'<td class="t3n0">(\d+)\.(\d)Q(.*?)</tr>', txt,
                         re.MULTILINE | re.DOTALL):
        Y, Q = m.group(1), m.group(2)
        if int(Y) < from_year:
            break
        cells = re.findall(r'>([^\n<]*)<', m.group(3))
        if len(cells) == 10:
            obj.eps.insert(0, eps_info(Y, Q, rev=cells[0], profit=cells[4],
                                       nor=cells[6], ni=cells[8], eps=cells[9]))
    return
def update_stock_report_wap_otc(obj):
    """Append monthly weighted-average-price rows for an OTC stock from
    the TPEX monthly statistics download."""
    now = datetime.datetime.now()
    for year in range(now.year - defs.from_year_offset, now.year + 1):
        url = 'https://www.tpex.org.tw/web/stock/statistics/monthly/download_st44.php?l=zh-tw'
        txt = xurl.load(url, opts=['--data-raw \'yy=%s&stk_no=%s\'' % (year, obj.code)])
        # quoted CSV rows: year, month, high, low, avg, count,
        # amount in thousands (A), shares in thousands (B), turnover%
        for m in re.finditer(r'"(\d+)","(\d+)","(.*?)","(.*?)","(.*?)",".*?","(.*?)","(.*?)",', txt):
            Y, M = m.group(1), m.group(2)
            h, l = m.group(3), m.group(4)
            amount = m.group(6).replace(',', '')
            shares = m.group(7).replace(',', '')
            # weighted average price = amount / shares (thousands cancel out)
            avg = '%.2f' % (float(amount) / float(shares))
            obj.wap.append(wap_info(Y, M, h, l, avg, amount + '000', shares + '000'))
    return
def get_stock_infos(data):
    """Query TWSE real-time quotes for every configured stock and return
    parsed stock_info objects (empty list when the API returns no rows)."""
    ex_ch = '|'.join([get_ex_ch_by_code(s['code']) for s in data['stocks']])
    url = 'https://mis.twse.com.tw/stock/api/getStockInfo.jsp?ex_ch=%s&json=1&delay=0' % (ex_ch)
    twse_data = json.loads(xurl.load(url, cache=False))
    if 'msgArray' not in twse_data:
        return []
    infos = []
    for msg in twse_data['msgArray']:
        # match each quote message back to its configured stock entry
        for s in data['stocks']:
            if s['code'] != msg['c']:
                continue
            info = stock_info(s['code'], s.get('flts'), s.get('tags'), s.get('notes'))
            info.msg = msg
            parse_info(info)
            infos.append(info)
    return infos
def findYouTubeNextPage(url, q):
    """Parse the (legacy) YouTube search pager and return navObj items,
    one per pager anchor/button."""
    objs = []
    local = xurl.genLocal(url, suffix='.old')
    txt = xurl.load(url, local, opts=['--cookie \"PREF=f1=50000000;f6=1408;f5=30;hl=en\"'])
    pages = re.search(r'search-pager(.*?)</div>', txt, re.DOTALL | re.MULTILINE)
    if not pages:
        return objs
    for m in re.finditer(r'<(a|button) .*?</(a|button)>', pages.group(1)):
        label_m = re.search(r'<span.*?">(.*?)</span>', m.group())
        label = label_m.group(1) if label_m else None
        link = None
        if m.group(1) == 'a':
            # only anchors carry an href; buttons (current page) do not
            href = re.search(r'href="([^"]*)"', m.group())
            if href:
                link = urljoin(url, href.group(1))
        objs.append(navObj(label, link))
    return objs
def update_stock_report_wap(obj):
    """Append monthly weighted-average-price rows for a TWSE-listed stock
    from the FMSRFK exchange report."""
    now = datetime.datetime.now()
    for year in range(now.year - defs.from_year_offset, now.year + 1):
        url = 'https://www.twse.com.tw/exchangeReport/FMSRFK?response=json&stockNo=%s&date=%4d0101' % (obj.code, year)
        txt = xurl.load(url)
        try:
            data = json.loads(txt)
        except ValueError:
            # narrowed from a bare except; json.JSONDecodeError subclasses
            # ValueError, so malformed responses still just skip the year
            continue
        if 'data' not in data:
            continue
        # row: year, month, high, low, weighted avg, count,
        #      amount (A), shares (B), turnover%
        for d in data['data']:
            Y, M = d[0], d[1]
            h, l, a = d[2], d[3], d[4]
            A = d[6].replace(',', '')
            B = d[7].replace(',', '')
            obj.wap.append(wap_info(Y, M, h, l, a, A, B))
    return
def getSource(url, fmt, ref):
    """Decode the base64 playlist embedded in a pangzitv vod-play page and
    return the URL for the episode number encoded in the page URL.

    Returns None for non vod-play URLs or on any extraction failure.
    """
    if not re.search(r'vod-play-id', url):
        return None
    try:
        ep_num = int(re.search(r'num-(\d+)', url).group(1))
        txt = xurl.load(url)
        code = re.search(r'base64decode\(\'([^\']*)', txt).group(1)
        decoded = xurl.unquote(base64.b64decode(code))
        print('\n[pangzitv][DBG][decoded]\n\n\t%s' % (decoded))
        # process unicode special character (%uXXXX escapes)
        # NOTE(review): str.decode() is Python-2-only; this needs porting
        # (e.g. codecs.decode(..., 'unicode_escape')) on Python 3 — confirm
        # the project's target interpreter
        decoded = decoded.replace('%u', '\\u').decode('unicode_escape')
        urls = [m.group() for m in re.finditer(r'http[^#$\n]*', decoded)]
        if len(urls) >= ep_num:
            return urls[ep_num - 1]
        return urls[0]
    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        print('Exception')
    return None
def getSource(url, fmt, ref):
    """Build a master m3u8 pairing the best available DASH video and
    audio streams found in the page; returns the master playlist path."""
    txt = xurl.load(url)
    video, audio = [], []
    video_ids = ['64', '32', '16']   # acceptable video quality ids
    audio_ids = ['30280', '30216']   # acceptable audio quality ids
    video_id = None
    audio_id = None
    for m in re.finditer(r'"id":(\d+),"baseUrl":"([^"]*)"', txt):
        _id, _url = m.group(1), m.group(2)
        if _id in video_ids:
            # lock onto the first acceptable video id seen and collect
            # only that rendition's URLs
            if not video_id:
                video_id = _id
            if _id == video_id:
                video.append(_url)
        if _id in audio_ids:
            if not audio_id:
                audio_id = _id
            if _id == audio_id:
                audio.append(_url)
    local_a = xurl.genLocal(url, prefix='vod_list_', suffix='.audio.m3u8')
    local_v = xurl.genLocal(url, prefix='vod_list_', suffix='.video.m3u8')
    local = xurl.genLocal(url, prefix='vod_list_', suffix='.m3u8')
    xurl.saveM3U8(local_a, audio)
    xurl.saveM3U8(local_v, video)
    master = [
        '#EXTM3U',
        '#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",LANGUAGE="eng",URI="%s"' % (local_a),
        '#EXT-X-STREAM-INF:AUDIO="audio"',
        local_v,
    ]
    xurl.saveLocal(local, '\n'.join(master))
    return local
def getSource(url, fmt, ref):
    """Return the argument of the page's geturl('...') call, if any."""
    m = re.search(r'geturl\(\'(.*?)\'\)', xurl.load(url))
    return m.group(1) if m else None
def loadLocal(url):
    """Load *url* through the local vod proxy on 127.0.0.1."""
    return xurl.load('http://127.0.0.1/vod/' + url)
def getSource(url):
    """Return the page's first <source src="..."> URL, falling back to
    the page URL itself when none is present."""
    m = re.search(r'source src="([^"]*)"', xurl.load(url))
    return m.group(1) if m else url
def dl(url, local, ref=None, read=True):
    """Download *url* to *local* unless it already exists; when *read*
    is true, return the file contents, otherwise None."""
    if not os.path.exists(local):
        xurl.load(url, local, ref=ref)
    return xurl.readLocal(local) if read else None
def getSource(url, fmt, ref):
    """Return the src attribute of the first <source ...> tag, if any."""
    m = re.search('<source.*? src="([^"]*)"', xurl.load(url))
    return m.group(1) if m else None
def get_date_from_bshtm():
    """Return the trading date shown on the TWSE bshtm welcome page as an
    int (ROC year/month/day digits concatenated).

    Raises AttributeError if the date label is missing from the page.
    """
    url = 'https://bsr.twse.com.tw/bshtm/bsWelcome.aspx'
    txt = xurl.load(url)
    m = re.search(r'<span id="Label_Date">(\d+)/(\d+)/(\d+)</span>', txt)
    return int(m.group(1) + m.group(2) + m.group(3))
def gen_tse():
    """Scrape all TSE listed codes and dump them to tse-code-list.txt."""
    url = 'https://isin.twse.com.tw/isin/C_public.jsp?strMode=2'
    txt = xurl.load(url)
    codes = re.findall(r'<tr><td bgcolor=#FAFAD2>(\w+)', txt)
    xurl.saveLocal('tse-code-list.txt', '\n'.join(codes))
    return
def load(url):
    """Thin wrapper so callers in this module share one loader entry point."""
    return xurl.load(url)