def get_real_url(self):
    for i, play_page_url in enumerate(self.play_page_urls):
        # logging.info('[Current source:] %s/%s', i + 1, self.num)
        play_page_url = 'https://www.yszxwang.com' + play_page_url
        # logging.info('[Play page URL:] %s', play_page_url)
        resp1 = get_response(play_page_url)
        if resp1:
            self.data_url = re.search('var now="(http.*?)"', resp1).group(1).strip()
            # logging.info('[Data URL:] %s', self.data_url)
            resp2 = get_response(self.data_url)
            if resp2:
                u2 = ''
                if 'm3u8' in self.data_url:
                    if self.type == 'w':
                        break
                    host = re.sub('index.*', '', self.data_url)
                    resp3 = get_response(self.data_url)
                    if resp3:
                        m3u8text = resp3.split('\n')
                        for text in m3u8text:
                            if 'm3u8' in text:
                                u2 = text
                else:
                    s = self.data_url.split('/')
                    host1 = s[0] + '//' + s[2]
                    u1 = re.search('var main = "(.*?)"', resp2).group(1).strip()
                    m3u8_url1 = host1 + u1
                    # logging.info('[First m3u8:] %s', m3u8_url1)
                    host = re.sub('index.*', '', m3u8_url1)
                    # Fetch the first m3u8 to obtain the real m3u8 URL
                    resp3 = get_response(m3u8_url1)
                    if resp3:
                        m3u8text = resp3.split('\n')
                        for text in m3u8text:
                            if 'm3u8' in text:
                                u2 = text
                if u2:
                    if u2[0] == '/':
                        real_url = host + u2[1:]
                    else:
                        real_url = host + u2
                    # logging.info('[Real m3u8:] %s', real_url)
                    # Quick availability check before returning
                    resp = get_response(real_url)
                    if resp:
                        return real_url
        logging.info('[Play page error:] %s', play_page_url)
    return 'All sources tried, parsing failed'
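
# --- Illustrative sketch (assumption, not part of the original source) --------
# get_response is used throughout this module but defined elsewhere; the code
# above only assumes it returns the response body text on success and a falsy
# value on failure. A minimal helper honouring that contract might look like
# the following (headers, retries and timeout are guesses):
#
#     import requests
#
#     def get_response(url, headers=None, timeout=10):
#         """Return the body text of url, or None if the request fails."""
#         try:
#             r = requests.get(url, headers=headers, timeout=timeout)
#             r.raise_for_status()
#             return r.text
#         except requests.RequestException:
#             return None
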
def get_all_source(self):
    max_num = 0
    max_play = 0
    resp = get_response(self.many_detail_url)
    all_source = re.findall("href='(/video.*?)'", resp)
    for s in all_source:
        num = int(re.search(r'-(\d+?)-', s).group(1).strip())
        play_num = int(re.search(r'-\d+?-(\d+?)\.html', s).group(1).strip())
        if num > max_num:
            max_num = num
        if play_num > max_play:
            max_play = play_num
    # Number of sources
    source_num = max_num + 1
    # Maximum episode count; sources that update slowly and are missing
    # episodes are discarded below
    self.play_num = max_play + 1
    all_play_list = []
    for i in range(source_num):
        source_list = []
        for s in all_source:
            # Group links by source index
            cate = int(re.search(r'-(\d+?)-', s).group(1).strip())
            if cate == i:
                source_list.append(s)
        # Keep only sources that carry the full episode count;
        # sources with missing episodes are dropped
        if len(source_list) == self.play_num:
            all_play_list.append(source_list)
    logging.info('[Sources found] %s, [episodes per source] %s',
                 len(all_play_list), self.play_num)
    return all_play_list
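
# Note on the link format get_all_source relies on (hypothetical example, for
# illustration only): a detail-page link shaped like '/video/12345-2-15.html'
# yields 2 from the first '-(\d+?)-' group (source index) and 15 from the
# '-\d+?-(\d+?)\.html' group (episode index), so max_num and max_play end up
# holding the highest source and episode indices found on the page.
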
def get_play_urls(self):
    # Open the detail page and collect the play page links
    resp = get_response(self.detail_url)
    if not resp:
        return "ERROR, request failed"
    play_page_urls = re.findall(
        'a title=.*? href=\'(.*?)\' target="_self"', resp)
    return play_page_urls
def down_m3u8_thread(url, file_name, host=None, headers=None):
    mkdir()
    file_name = file_name + '.mp4'
    logging.info('[url] %s [file_name] %s', url, file_name)
    # Pre-download: fetch the m3u8 playlist and build the segment queue
    resp = get_response(url)
    m3u8_text = resp
    # Queue of ts segment URLs
    ts_queue = Queue(10000)
    lines = m3u8_text.split('\n')
    concatfile = 'cache/' + "s" + '.txt'
    for line in lines:
        if '.ts' in line:
            if 'http' in line:
                ts_queue.put(line)
            else:
                if line[0] == '/':
                    line = host + line
                else:
                    line = host + '/' + line
                ts_queue.put(line)
            filename = re.search(r'([a-zA-Z0-9_-]+\.ts)', line).group(1).strip()
            with open(concatfile, 'a+') as f:
                f.write('file %s\n' % filename)
    num = ts_queue.qsize()
    logging.info('[Download starting, queued tasks:] %s', num)
    # Roughly one thread per five segments, capped at 50
    if num > 5:
        t_num = num // 5
    else:
        t_num = 1
    if t_num > 50:
        t_num = 50
    threads = []
    logging.info('Download started')
    for i in range(t_num):
        t = threading.Thread(target=down, name='th-' + str(i),
                             kwargs={'ts_queue': ts_queue, 'headers': headers})
        t.daemon = True
        threads.append(t)
    for t in threads:
        logging.info('[Thread starting]')
        time.sleep(0.4)
        t.start()
    for t in threads:
        logging.info('[Thread joining]')
        t.join()
    logging.info('Download finished, merging')
    merge(concatfile, file_name)
    logging.info('Merge finished, removing temporary files')
    remove()
    result = getLength(file_name)
    return result
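
# --- Illustrative sketch (assumption, not shown in this excerpt) --------------
# down_m3u8_thread hands work to down(ts_queue, headers), and the concat file it
# writes uses ffmpeg's concat-demuxer syntax, so merge() presumably shells out
# to ffmpeg. The worker itself is defined elsewhere; one that matches the
# contract assumed above (pull a ts URL, save the segment into cache/, exit when
# the queue drains) could look roughly like this:
#
#     import os
#     import requests
#     from queue import Empty
#
#     def down(ts_queue, headers=None):
#         while True:
#             try:
#                 ts_url = ts_queue.get(timeout=2)   # assume empty queue == done
#             except Empty:
#                 return
#             name = ts_url.rsplit('/', 1)[-1]
#             try:
#                 r = requests.get(ts_url, headers=headers, timeout=15)
#                 with open(os.path.join('cache', name), 'wb') as f:
#                     f.write(r.content)
#             except requests.RequestException:
#                 logging.warning('[segment failed] %s', ts_url)
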
def get_many_real_url(self):
    for i, play_list in enumerate(self.all_play_list):
        many_data_url = []
        many_real_url = []
        # logging.info('[Current source:] %s/%s', i + 1, self.source_num)
        for j, play_page_url in enumerate(play_list):
            # logging.info('[Current episode:] %s/%s', j + 1, self.play_num)
            play_page_url = 'https://www.yszxwang.com' + play_page_url
            # logging.info('[Play page URL:] %s', play_page_url)
            resp1 = get_response(play_page_url)
            if resp1:
                data_url = re.search('var now="(http.*?)"', resp1).group(1).strip()
                # logging.info('[Data URL:] %s', data_url)
                resp2 = get_response(data_url)
                many_data_url.append(data_url)
                if resp2:
                    u2 = ''
                    real_url = ''
                    if 'm3u8' in data_url:
                        if self.type == 'w':
                            break
                        host = re.sub('index.*', '', data_url)
                        resp3 = get_response(data_url)
                        if resp3:
                            m3u8text = resp3.split('\n')
                            for text in m3u8text:
                                if 'm3u8' in text:
                                    u2 = text
                    else:
                        # host = re.search('var redirecturl = "(http.*?)"', resp2).group(1).strip()
                        s = data_url.split('/')
                        host1 = s[0] + '//' + s[2]
                        u1 = re.search('var main = "(.*?)"', resp2).group(1).strip()
                        m3u8_url1 = host1 + u1
                        # logging.info('[First m3u8:] %s', m3u8_url1)
                        host = re.sub('index.*', '', m3u8_url1)
                        # Fetch the first m3u8 to obtain the real m3u8 URL
                        resp3 = get_response(m3u8_url1)
                        if resp3:
                            m3u8text = resp3.split('\n')
                            for text in m3u8text:
                                if 'm3u8' in text:
                                    u2 = text
                        else:
                            break
                    if u2:
                        if u2[0] == '/':
                            real_url = host + u2[1:]
                        else:
                            real_url = host + u2
                    if not real_url:
                        break
                    # logging.info('[Real m3u8:] %s', real_url)
                    # Quick availability check on the link
                    resp = get_response(real_url)
                    if resp:
                        many_real_url.append(real_url)
                    else:
                        break
                else:
                    break
            else:
                break
        if len(many_real_url) == self.play_num and len(
                many_data_url) == self.play_num:
            return many_real_url, many_data_url
    return 'defeat', 'defeat'
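
# --- Illustrative usage sketch (assumption; the orchestration code is not part
# of this excerpt) --------------------------------------------------------------
# From inside the class, the multi-source path implied by the methods above
# would chain roughly like this:
#
#     self.all_play_list = self.get_all_source()
#     many_real_url, many_data_url = self.get_many_real_url()
#     if many_real_url != 'defeat':
#         for idx, real_url in enumerate(many_real_url):
#             host = re.sub('index.*', '', real_url)   # same host derivation as above
#             down_m3u8_thread(real_url, str(idx + 1), host=host)
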