def _cswhcs(pq):
    def is_next_url():
        next_url = ''
        fanye = pq('div.fanye')
        if '下一页' in fanye.text():  # a "next page" link is present
            next_url = pyquery.PyQuery(fanye)('a:nth-last-child(2)').attr('href')
        if next_url:
            next_url = 'https://cswhcs.com' + next_url
        else:
            next_url = None
        return next_url

    # Page 1 is scraped by the caller; follow the pagination links for the rest
    images_url = []
    next_url = is_next_url()
    while next_url:
        print(next_url)
        response = get_response(next_url)
        pq = pyquery.PyQuery(response.text)
        divs = pq(Comic.rules_dict.get(Comic.current_host_key).get('comic_pages'))
        for div in divs:
            img_src = pyquery.PyQuery(div)('img').attr('data-original')
            if not img_src:
                img_src = pyquery.PyQuery(div)('img').attr('src')
            images_url.append(img_src)
        # Check whether another page follows
        next_url = is_next_url()
    return images_url
def _733(detail_url):
    response = get_response(detail_url)
    qTcms_S_m_murl_e = re.findall('var qTcms_S_m_murl_e="(.*?)"', response.text)[0]
    # Decode the base64 payload and split on the site's '$qingtiandy$' separator
    images_url = base64.b64decode(qTcms_S_m_murl_e).decode('utf-8').split('$qingtiandy$')
    return images_url
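# A minimal round-trip check of the decode step above, on a synthetic payload
# (the real pages base64-encode the image list joined by '$qingtiandy$'):
def _demo_733_decode():
    import base64
    payload = base64.b64encode('1.jpg$qingtiandy$2.jpg'.encode()).decode()
    assert base64.b64decode(payload).decode('utf-8').split('$qingtiandy$') == ['1.jpg', '2.jpg']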
def _nxueli(detail_url, current_host_key):
    response = get_response(detail_url)
    chapter_path_regex = 'chapterPath = "(.*?)"'
    chapter_images = eval(
        re.sub(r'\\', '',
               re.search(r'chapterImages = (\[.*?\])', response.text).group(1)))
    if current_host_key == 'nxueli':
        return ['https://images.nxueli.com' + i for i in chapter_images]
    elif current_host_key == '90ff':
        chapter_path = re.search(chapter_path_regex, response.text).group(1)
        return [f'http://90ff.bfdblg.com/{chapter_path}' + i for i in chapter_images]
    elif current_host_key == 'mh1234':
        chapter_path = re.search(chapter_path_regex, response.text).group(1)
        return [f'https://img.wszwhg.net/{chapter_path}' + i for i in chapter_images]
    elif current_host_key == '36mh':
        chapter_path = re.search(chapter_path_regex, response.text).group(1)
        return [f'https://img001.shmkks.com/{chapter_path}' + i for i in chapter_images]
    elif current_host_key == 'manhuaniu':
        return ['https://restp.dongqiniqin.com/' + i for i in chapter_images]
def _cocomanhua(detail_url):
    response = get_response(detail_url)
    data = re.findall(r'var C_DATA.*?\'(.*?)\'', response.text)[0]
    ctx = execjs.get().compile(
        open('js/_cocomanhua.js', encoding='utf-8').read(),
        cwd='js/node_modules')
    images_url = ctx.eval(f'getArr("{data}")')
    return images_url
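# The PyExecJS pattern used throughout this module, shown on a tiny inline
# script. `cwd='js/node_modules'` in the calls above lets the bundled Node
# scripts resolve their npm dependencies; this self-contained sketch needs none:
def _demo_execjs():
    import execjs
    ctx = execjs.compile('function add(a, b) { return a + b; }')
    assert ctx.call('add', 1, 2) == 3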
def _happymh(detail_url):
    response = get_response(detail_url)
    ss = re.findall('var ss = ({.*?});', response.text)[0].replace('\\', '')
    ctx = execjs.get().compile(
        open('js/_m_happymh.js', encoding='utf-8').read(),
        cwd='js/node_modules')
    data = ctx.call('getArr', eval(ss))
    image_url = [d['url'] for d in data]
    return image_url
def get_images_url(detail_dict: dict) -> dict:
    images_url = []
    nxueli_type = ('nxueli', '90ff', 'manhuaniu', '36mh', 'mh1234')
    cswhcs_type = ('cswhcs', 'kanleying', 'qinqinmh')
    mm820_type = ('mm820', 'hanmzj')
    detail_url = detail_dict.get('a_href')
    a_title = detail_dict.get('a_title')
    comic_title = detail_dict.get('comic_title')
    if Comic.current_host_key == 'dongmanmanhua':
        images_url = Comic._dongmanmanhua(detail_url)
        return {'images_url': images_url, 'a_title': a_title, 'comic_title': comic_title}
    elif Comic.current_host_key in nxueli_type:
        # `_nxueli` also needs the host key to pick the right image server
        images_url = Comic._nxueli(detail_url, Comic.current_host_key)
        return {'images_url': images_url, 'a_title': a_title, 'comic_title': comic_title}
    elif Comic.current_host_key == 'happymh':
        response = get_response(detail_url)
        imgs_url = eval(re.search('var scans = (.*?);', response.text).group(1))
        imgs_url = [i for i in imgs_url if isinstance(i, dict)]
        for img_url in imgs_url:
            img_src = img_url['url']
            images_url.append(img_src)
        return {'images_url': images_url, 'a_title': a_title, 'comic_title': comic_title}

    response = get_response(detail_url)
    pq = pyquery.PyQuery(response.text)
    divs = pq(Comic.rules_dict.get(Comic.current_host_key).get('comic_pages'))
    for div in divs:
        img_src = pyquery.PyQuery(div)('img').attr('data-original')
        if not img_src:
            img_src = pyquery.PyQuery(div)('img').attr('src')
        images_url.append(img_src)
    # Handle special cases
    if Comic.current_host_key in cswhcs_type:
        images_url.extend(Comic._cswhcs(pq))
    if Comic.current_host_key in mm820_type:
        # Get the number of pagination pages
        pages = len(pq('.selectpage option'))
        images_url.extend(Comic._mm820(detail_url, pages))
    if Comic.current_host_key == '18comic':
        images_url = [img for img in images_url if img]
    return {'images_url': images_url, 'a_title': a_title, 'comic_title': comic_title}
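# Hypothetical usage of `get_images_url` above. The dict values are invented
# placeholders; in the real flow they come from `get_detail_dicts`. Adjust the
# call to `Comic.get_images_url(...)` if the function lives on the Comic class:
def _demo_get_images_url():
    detail_dict = {
        'a_title': '001 第一话',                     # example chapter title
        'a_href': 'https://example.com/chapter/1',  # placeholder URL
        'comic_title': 'example-comic',             # placeholder title
    }
    result = get_images_url(detail_dict)
    print(result['a_title'], len(result['images_url']))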
def _dongmanmanhua(detail_url):
    images_url = []
    detail_url = 'https:' + detail_url
    response = get_response(detail_url)
    pq = pyquery.PyQuery(response.text)
    imgs = pq(pc_rules_dict.get('dongmanmanhua').get('comic_pages'))
    for img in imgs:
        image_url = pyquery.PyQuery(img).attr('data-url')
        images_url.append(image_url)
    return images_url
def _mm820(detail_url, pages: int):
    images_url = []
    # Page 1 is scraped by the caller; fetch pages 2..N here
    for i in range(2, pages + 1):
        response = get_response(detail_url + f'?page={i}')
        pq = pyquery.PyQuery(response.text)
        divs = pq(pc_rules_dict.get('mm820').get('comic_pages'))
        for div in divs:
            img_src = pyquery.PyQuery(div)('img').attr('data-original')
            if not img_src:
                img_src = pyquery.PyQuery(div)('img').attr('src')
            images_url.append(img_src)
    return images_url
def parse_images_url(url):
    # Note: appends into `images_url` from the enclosing scope; this helper is
    # meant to be nested inside a function that defines `images_url = []`.
    if Comic.current_host_key in ('nonomh', 'qinqinmh', 'wzdhm'):
        header = {'User-Agent': random.choice(settings.ua['android'])}
        response = get_response(url, header=header)
    else:
        response = get_response(url)
    pq = pyquery.PyQuery(response.text)
    divs = pq(settings.pc_rules_dict.get(Comic.current_host_key).get('comic_pages'))
    for div in divs:
        if Comic.current_host_key in ('nonomh', 'qinqinmh', 'wzdhm'):
            img_src = pyquery.PyQuery(div).attr('data-original')
        else:
            img_src = pyquery.PyQuery(div)('img').attr('data-original')
        if not img_src:
            img_src = pyquery.PyQuery(div)('img').attr('src')
        images_url.append(img_src)
    return pq
def get_detail_dicts(url, host_url, host_key) -> list:
    response = get_response(url)
    pq = pyquery.PyQuery(response.text)
    Comic.current_host_key = host_key
    rule = Comic.rules_dict.get(host_key, '')
    if not rule:
        raise KeyError('This site is not yet supported')
    if Comic.current_host_key == 'happymh':
        comic_title = pq(rule.get('comic_title')).text()
        return Comic._happymh(response, comic_title)

    def detail_one_page(detail_url):
        response = get_response(detail_url)
        pq = pyquery.PyQuery(response.text)
        lis = pq(rule.get('detail_lis'))
        comic_title = pq(rule.get('comic_title')).text()
        if Comic.current_host_key == '18comic':
            if not lis.length:
                # Single-chapter comic: link straight to the reader page
                detail_dict = {
                    'a_title': '共一话',  # "only one chapter"
                    'a_href': host_url + pq('div.read-block a:first-child').attr('href').lstrip('/'),
                    'comic_title': comic_title,
                }
                Comic.detail_dicts.append(detail_dict)
                return Comic.detail_dicts
            else:
                print(f'This comic has {len(lis)} chapters')
        for li in lis:
            a_title = pyquery.PyQuery(li)('a').text()
            a_href = pyquery.PyQuery(li)('a').attr('href')
            for ch in r'\/:| <.・>?*"':
                a_title = a_title.replace(ch, '・')  # strip characters illegal in file names
                comic_title = comic_title.replace(ch, '・')
            if Comic.current_host_key == 'dongmanmanhua':
                a_title = a_title.split('・')[0]
            detail_dict = {
                'a_title': a_title,
                'a_href': host_url + a_href.lstrip('/') if host_key not in a_href else a_href,
                'comic_title': comic_title,
            }
            Comic.detail_dicts.append(detail_dict)

    detail_one_page(url)
    # Special cases: pyquery does not seem to support nth-child(n+3)-style filters
    if Comic.current_host_key == 'hmba':
        Comic.detail_dicts = Comic.detail_dicts[9:]
    if Comic.current_host_key == 'dongmanmanhua':
        total_pages = len(pq('.paginate a'))
        for i in range(2, total_pages + 1):
            detail_one_page(url + f'&page={i}')
        Comic.detail_dicts.reverse()
    return Comic.detail_dicts
def _ac_qq(detail_url):
    # If a cookie is required, configure it like this:
    # headers = {
    #     'cookie': ...
    # }
    response = get_response(detail_url)
    N = re.findall(r'window\["n.*?e"\]\s=\s(.*?);', response.text)[1]
    T = re.findall(r'var DATA.*?\'(.*?)\'', response.text)[0]
    ctx = execjs.get().compile(
        open('js/_ac_qq.js', encoding='gbk').read(),
        cwd='js/node_modules')
    data = ctx.call('getArr', T, N)
    images_url = [picture['url'] for picture in data['picture']]
    return images_url
def get_images_url(detail_url):
    response = get_response(detail_url)
    packed = re.findall('packed="(.*)";', response.text)[0]
    ctx = execjs.get().compile(open('js/_pufei8.js').read())
    img_urls = ctx.eval(f'getArr("{packed}")')[1:]
    if 'http' in img_urls[0]:
        return img_urls
    if 'taduo' in detail_url:
        img_servers = ['http://mh.jiduo.cc/']
    else:
        img_servers = ['http://res.img.jituoli.com/', 'http://res.img.fffmanhua.com/']
    images_url = [random.choice(img_servers) + i for i in img_urls]
    return images_url
def _momomh(detail_url):
    header = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) '
                      'AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 '
                      'Mobile/15A5341f Safari/604.1 Edg/85.0.4183.83',
    }
    response = get_response(detail_url, header=header)
    load_conf = re.findall('loadConf = ({.*?})', response.text, re.S)[0].strip('\n')
    # Quote the unquoted JS object keys so the literal can be eval'd as a dict
    for token in ('i:', 'c:', 'k:', 'd:', 'l:', 'f:'):
        load_conf = load_conf.replace(token, f'"{token[0]}":')
    ctx = execjs.get().compile(open('js/_momomh.js').read(), cwd='js/node_modules')
    data = ctx.call('getArr', eval(load_conf))
    # str.strip() removes a character set, not a suffix; use replace() to drop
    # the '_w_720' size suffix without eating trailing URL characters
    image_url = [url.replace('_w_720', '') for url in data]
    return image_url
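# A small sketch of the key-quoting step in `_momomh` above: the page embeds a
# JS object literal with unquoted keys (i:, c:, k:, ...), which Python's eval
# cannot parse until the keys are quoted. The sample input is invented:
def _demo_quote_load_conf():
    load_conf = "{i: 123, c: 'abc', k: 'key'}"
    for token in ('i:', 'c:', 'k:', 'd:', 'l:', 'f:'):
        load_conf = load_conf.replace(token, f'"{token[0]}":')
    return eval(load_conf)  # -> {'i': 123, 'c': 'abc', 'k': 'key'}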
def get_one_page_img(detail_url):
    # Appends into `images_url` from the enclosing scope; returns the URL of
    # the next page, or None once the last page is reached.
    response = get_response(detail_url)
    pq = pyquery.PyQuery(response.text)
    divs = pq(pc_rules_dict.get('haimaoba').get('comic_pages'))
    for div in divs:
        img_src = pyquery.PyQuery(div)('img').attr('src')
        images_url.append(img_src)
    fanye = pq('.fanye1 a')
    if fanye:
        next_url = pyquery.PyQuery(fanye).attr('href')
        next_url = 'http://m.haimaoba.com' + next_url
    else:
        next_url = None
    return next_url
def get_detail_dicts(url, host_url, host_key) -> list:
    if host_key in ('momomh',):  # one-element tuple, not a bare string
        url = url.replace('www', 'm', 1)
    print(f'----------{url}-------------')
    response = get_response(url)
    pq = pyquery.PyQuery(response.text)
    Comic.current_host_key = host_key
    rule = settings.pc_rules_dict.get(host_key, '')
    if not rule:
        raise KeyError(f'{host_url}----> please configure this site in the settings file first')
    if Comic.current_host_key == 'kanman':
        from kanman_com import Kanman
        return Kanman._kanman(url)
    if Comic.current_host_key == 'happymh':
        chapter_list = re.findall('"chapterList":(.*),', response.text)[0]
        chapter_list = eval(chapter_list.replace('false', '0').replace('true', '1'))
        for d in chapter_list:
            detail_dict = {
                'chapter': Comic.exclude_character(d['chapterName']),
                'chapter_url': url + '/' + d['id'],
                'comic_title': pq('.mg-title').text(),
            }
            Comic.detail_dicts.append(detail_dict)
        return Comic.detail_dicts[::-1]

    def detail_one_page(detail_url):
        if Comic.current_host_key in ('momomh',):
            response = get_response(detail_url, header={'User-Agent': random.choice(settings.ua['android'])})
        elif Comic.current_host_key in ('haimaoba', 'pufei8', 'taduo'):
            response = get_response(detail_url, encoding='gbk')
        else:
            response = get_response(detail_url)
        pq = pyquery.PyQuery(response.text)
        lis = pq(rule.get('detail_lis'))
        comic_title = pq(rule.get('comic_title')).text()
        if '最终话' in Comic.exclude_character(comic_title):  # "final chapter" marker
            comic_title = Comic.exclude_character(comic_title + '(完结)')  # tag as completed
        if Comic.current_host_key == '18comic2':
            comic_title = comic_title[(len(comic_title) // 2) + 1:]
            if not lis.length:
                # Single-chapter comic: link straight to the reader page
                detail_dict = {
                    'chapter': '共一话',  # "only one chapter"
                    'chapter_url': host_url + pq('div.read-block a:first-child').attr('href').lstrip('/'),
                    'comic_title': Comic.exclude_character(comic_title) + '(完结)',
                }
                Comic.detail_dicts.append(detail_dict)
                return Comic.detail_dicts
        for li in lis:
            chapter = pyquery.PyQuery(li)('a').text()
            # Rebuild chapter names as "001 title"
            if Comic.current_host_key in Comic.is_serial_number:
                if host_key in ('qq', 'manhuaniu', '36mh', 'kanman'):
                    chapter = str(lis.index(li) + 1).rjust(3, '0') + ' ' + chapter
                else:
                    chapter = str(lis[::-1].index(li) + 1).rjust(3, '0') + ' ' + chapter
            chapter_url = pyquery.PyQuery(li)('a').attr('href')
            if Comic.current_host_key == 'dongmanmanhua':
                chapter = chapter.split('・')[0]
            detail_dict = {
                'chapter': Comic.exclude_character(chapter),
                'chapter_url': host_url + chapter_url.lstrip('/') if host_key not in chapter_url else chapter_url,
                'comic_title': Comic.exclude_character(comic_title),
            }
            Comic.detail_dicts.append(detail_dict)

    detail_one_page(url)
    # Special cases: pyquery does not seem to support nth-child(n+3)-style filters
    if Comic.current_host_key == 'hmba':
        Comic.detail_dicts = Comic.detail_dicts[9:]
    if Comic.current_host_key == 'dongmanmanhua':
        total_pages = len(pq('.paginate a'))
        for i in range(2, total_pages + 1):
            detail_one_page(url + f'&page={i}')
        Comic.detail_dicts.reverse()
    if Comic.current_host_key == '18comic2':
        try:
            if len(Comic.detail_dicts[0]['chapter']) > len(Comic.detail_dicts[1]['chapter']):
                Comic.detail_dicts[0]['chapter'] = Comic.detail_dicts[0]['chapter'].replace(
                    Comic.detail_dicts[0]['comic_title'], '1 ')
            Comic.detail_dicts[-1]['chapter'] = Comic.detail_dicts[-1]['chapter'].replace('最新', '')
        except IndexError:
            print('Short comic')
        return Comic.detail_dicts
    if Comic.current_host_key in ('733', 'pufei8', 'taduo', 'cocomanhua'):
        return Comic.detail_dicts[::-1]
    return Comic.detail_dicts
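# End-to-end sketch of the intended pipeline (URL and host values are
# placeholders). Note this version of `get_detail_dicts` emits 'chapter' and
# 'chapter_url' keys, while the `get_images_url` dispatcher earlier reads
# 'a_title' and 'a_href'; the two would need to agree in a real run:
def _demo_pipeline():
    detail_dicts = get_detail_dicts(
        'https://www.example.com/comic/123',   # placeholder comic index URL
        host_url='https://www.example.com/',
        host_key='36mh')
    for detail_dict in detail_dicts:
        print(detail_dict['chapter'], detail_dict['chapter_url'])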