Beispiel #1
0
 def detail_one_page(detail_url):
     """Parse one chapter-listing page and append its chapters to Comic.detail_dicts.

     detail_url: URL of the comic's listing page.
     Returns Comic.detail_dicts early only for a single-chapter 18comic2 comic;
     otherwise returns None after appending every chapter dict.
     """
     response = get_response(detail_url)
     # momomh is re-fetched with a mobile User-Agent — presumably the desktop
     # markup lacks the chapter list; confirm against the site.
     if Comic.current_host_key in ('momomh', ):
         response = get_response(detail_url,
                                 header={
                                     'User-Agent':
                                     random.choice(
                                         settings.ua['android'])
                                 })
     # These hosts are decoded as GBK instead of the default encoding.
     if Comic.current_host_key in ('haimaoba', 'pufei8', 'taduo'):
         response = get_response(detail_url, encoding='gbk')
     pq = pyquery.PyQuery(response.text)
     lis = pq(rule.get('detail_lis'))
     comic_title = pq(rule.get('comic_title')).text()
     # Tag finished series: titles containing "最终话" get a "(完结)" suffix.
     if '最终话' in Comic.exclude_character(comic_title):
         comic_title = Comic.exclude_character(comic_title + '(完结)')
     if Comic.current_host_key == '18comic2':
         # Keeps only the second half of the title — presumably the site
         # duplicates it; TODO confirm against a live page.
         comic_title = comic_title[(len(comic_title) // 2) + 1:]
     if Comic.current_host_key == '18comic2':
         if not lis.length:
             # No chapter list at all: single-chapter comic, link straight
             # to the reader page.
             detail_dict = {
                 'chapter':
                 '共一话',
                 'chapter_url':
                 host_url + pq('div.read-block a:first-child').attr(
                     'href').lstrip('/'),
                 'comic_title':
                 Comic.exclude_character(comic_title) + '(完结)',
             }
             Comic.detail_dicts.append(detail_dict)
             return Comic.detail_dicts
     for li in lis:
         chapter = pyquery.PyQuery(li)('a').text()
         # Rebuild the chapter name as "001 <title>".
         if Comic.current_host_key in Comic.is_serial_number:
             if host_key in ('qq', 'manhuaniu', '36mh', 'kanman'):
                 chapter = str(lis.index(li) + 1).rjust(
                     3, '0') + " " + chapter
             else:
                 # Other hosts list newest-first: number from the end.
                 chapter = str(lis[::-1].index(li) + 1).rjust(
                     3, '0') + " " + chapter
         chapter_url = pyquery.PyQuery(li)('a').attr('href')
         if Comic.current_host_key == 'dongmanmanhua':
             chapter = chapter.split('・')[0]
         detail_dict = {
             'chapter':
             Comic.exclude_character(chapter),
             'chapter_url':
             host_url + chapter_url.lstrip('/')
             if host_key not in chapter_url else chapter_url,
             'comic_title':
             Comic.exclude_character(comic_title),
         }
         Comic.detail_dicts.append(detail_dict)
Beispiel #2
0
    def _cswhcs(pq):
        """Follow the '下一页' pagination of a cswhcs-style chapter and
        collect the image URLs from every follow-up page.

        pq: PyQuery document of the first chapter page (rebound as we
        paginate so the nested helper always inspects the newest page).
        Returns the list of image URLs gathered from pages 2..N.
        """
        def next_page_url():
            # The pager lives in div.fanye; when a "下一页" (next page)
            # label is present, the second-to-last anchor is the link.
            pager = pq('div.fanye')
            href = ''
            if '下一页' in pager.text():
                href = pyquery.PyQuery(pager)('a:nth-last-child(2)').attr('href')
            return 'https://cswhcs.com' + href if href else None

        collected = []
        page_url = next_page_url()
        while page_url:
            print(page_url)
            page = get_response(page_url)
            pq = pyquery.PyQuery(page.text)
            selector = Comic.rules_dict.get(Comic.current_host_key).get('comic_pages')
            for node in pq(selector):
                img = pyquery.PyQuery(node)('img')
                # Lazy-loaded images carry data-original; fall back to src.
                collected.append(img.attr('data-original') or img.attr('src'))
            page_url = next_page_url()
        return collected
Beispiel #3
0
 def detail_one_page(detail_url):
     """Parse one chapter-listing page and append its chapters to Comic.detail_dicts.

     Returns Comic.detail_dicts early only for a single-chapter 18comic
     comic; otherwise returns None after appending every chapter dict.
     """
     doc = pyquery.PyQuery(get_response(detail_url).text)
     lis = doc(rule.get('detail_lis'))
     comic_title = doc(rule.get('comic_title')).text()
     if Comic.current_host_key == '18comic':
         if lis.length:
             print(f'该漫画共{len(lis)}章节')
         else:
             # No chapter list: single-chapter comic, link to the reader.
             Comic.detail_dicts.append({
                 'a_title': '共一话',
                 'a_href': host_url + doc('div.read-block a:first-child').attr('href').lstrip('/'),
                 'comic_title': comic_title,
             })
             return Comic.detail_dicts
     unsafe_chars = r'\/:| <.・>?*"'
     for item in lis:
         link = pyquery.PyQuery(item)('a')
         a_title = link.text()
         a_href = link.attr('href')
         # Replace filesystem-unsafe characters in both names.
         for ch in unsafe_chars:
             a_title = a_title.replace(ch, '・')
             comic_title = comic_title.replace(ch, '・')
         if Comic.current_host_key == 'dongmanmanhua':
             a_title = a_title.split('・')[0]
         Comic.detail_dicts.append({
             'a_title': a_title,
             'a_href': a_href if host_key in a_href else host_url + a_href.lstrip('/'),
             'comic_title': comic_title,
         })
Beispiel #4
0
 def _733(detail_url):
     """Return the image URLs of a 733-style chapter page.

     The page embeds a base64 payload in ``var qTcms_S_m_murl_e="..."``;
     decoding it yields image URLs joined by the ``$qingtiandy$`` marker.
     """
     response = get_response(detail_url)
     encoded = re.findall('var qTcms_S_m_murl_e="(.*?)"', response.text)[0]
     # Bug fix: str(b'...') yields the literal "b'...'" and .strip('b') /
     # .strip("'") remove character *sets* from both ends (eating legitimate
     # leading/trailing characters) while escape sequences stay literal.
     # Decode the bytes properly instead.
     images_url = base64.b64decode(encoded).decode().split("$qingtiandy$")
     return images_url
Beispiel #5
0
 def _nxueli(detail_url, current_host_key):
     """Resolve full image URLs for the nxueli family of comic hosts.

     detail_url: chapter page URL; the page embeds a JS array literal
         ``chapterImages = [...]`` and (for some hosts) ``chapterPath = "..."``.
     current_host_key: which host's CDN prefix to apply.
     Returns a list of absolute image URLs, or None for an unknown host key
     (matching the original implicit fall-through).
     """
     import ast  # local import keeps the fix self-contained

     response = get_response(detail_url)
     # chapterImages is a JS array literal; drop backslash escapes, then
     # parse it with ast.literal_eval — eval() on scraped remote text was
     # an arbitrary-code-execution hazard.
     chapter_images = ast.literal_eval(
         re.sub(r'\\', '',
                re.search(r'chapterImages = (\[.*?\])',
                          response.text).group(1)))

     if current_host_key == 'nxueli':
         return ['https://images.nxueli.com' + i for i in chapter_images]
     if current_host_key == 'manhuaniu':
         return [
             'https://restp.dongqiniqin.com/' + i for i in chapter_images
         ]
     # Hosts whose CDN prefix embeds the per-chapter path from the page.
     path_hosts = {
         '90ff': 'http://90ff.bfdblg.com/',
         'mh1234': 'https://img.wszwhg.net/',
         '36mh': 'https://img001.shmkks.com/',
     }
     if current_host_key in path_hosts:
         chapter_path = re.search('chapterPath = "(.*?)"',
                                  response.text).group(1)
         prefix = path_hosts[current_host_key] + chapter_path
         return [prefix + i for i in chapter_images]
     return None  # unknown host key
 def _cocomanhua(detail_url):
     """Return the image URLs of a cocomanhua chapter.

     The obfuscated chapter payload (C_DATA) is decoded by handing it to
     js/_cocomanhua.js through execjs.
     """
     response = get_response(detail_url)
     data = re.findall(r'var C_DATA.*?\'(.*?)\'', response.text)[0]
     # Use a context manager so the script file handle is always closed
     # (the original open() leaked the file object).
     with open('js/_cocomanhua.js', encoding='utf-8') as f:
         ctx = execjs.get().compile(f.read(), cwd='js/node_modules')
     images_url = ctx.eval(f'getArr("{data}")')
     return images_url
Beispiel #7
0
 def _happymh(detail_url):
     """Return the image URLs of a happymh chapter.

     Extracts the page's ``var ss = {...};`` object and hands it to
     js/_m_happymh.js for decryption; each returned dict carries a 'url'.
     """
     response = get_response(detail_url)
     ss = re.findall('var ss = ({.*?});',
                     response.text)[0].replace('\\', '')
     # Context manager fixes the leaked file handle; encoding pinned to
     # UTF-8 for consistency with the other js/ script reads.
     with open('js/_m_happymh.js', encoding='utf-8') as f:
         ctx = execjs.get().compile(f.read(), cwd='js/node_modules')
     # NOTE(review): eval() on scraped page text is dangerous; the JS object
     # may have unquoted keys so it is not valid JSON — kept as-is, but a
     # stricter parser would be safer.
     data = ctx.call('getArr', eval(ss))
     image_url = [d['url'] for d in data]
     return image_url
Beispiel #8
0
    def get_images_url(detail_dict: dict) -> dict:
        """Resolve the image URL list for one chapter description dict.

        detail_dict: {'a_href', 'a_title', 'comic_title'} as produced by
        the detail parsers. Dispatches per Comic.current_host_key and
        returns {'images_url', 'a_title', 'comic_title'}.
        """
        detail_url = detail_dict.get('a_href')
        a_title = detail_dict.get('a_title')
        comic_title = detail_dict.get('comic_title')
        host = Comic.current_host_key

        def packed(urls):
            # Uniform result shape shared by every branch.
            return {'images_url': urls, 'a_title': a_title, 'comic_title': comic_title}

        if host == 'dongmanmanhua':
            return packed(Comic._dongmanmanhua(detail_url))
        if host in ('nxueli', '90ff', 'manhuaniu', '36mh', 'mh1234'):
            return packed(Comic._nxueli(detail_url))
        if host == 'happymh':
            response = get_response(detail_url)
            scans = eval(re.search('var scans = (.*?);', response.text).group(1))
            return packed([d['url'] for d in scans if isinstance(d, dict)])

        response = get_response(detail_url)
        pq = pyquery.PyQuery(response.text)
        images_url = []
        for node in pq(Comic.rules_dict.get(host).get('comic_pages')):
            img = pyquery.PyQuery(node)('img')
            # Lazy-loaded images carry data-original; fall back to src.
            images_url.append(img.attr('data-original') or img.attr('src'))
        # Hosts that paginate a single chapter across several URLs.
        if host in ('cswhcs', 'kanleying', 'qinqinmh'):
            images_url.extend(Comic._cswhcs(pq))
        if host in ('mm820', 'hanmzj'):
            page_count = len(pq('.selectpage option'))
            images_url.extend(Comic._mm820(detail_url, page_count))
        if host == '18comic':
            images_url = [img for img in images_url if img]

        return packed(images_url)
Beispiel #9
0
 def _dongmanmanhua(detail_url):
     """Collect image URLs for a dongmanmanhua chapter.

     detail_url is protocol-relative; an https: scheme is prepended
     before fetching. Each image node carries its URL in 'data-url'.
     """
     doc = pyquery.PyQuery(get_response('https:' + detail_url).text)
     selector = pc_rules_dict.get('dongmanmanhua').get('comic_pages')
     return [pyquery.PyQuery(img).attr('data-url') for img in doc(selector)]
Beispiel #10
0
 def _mm820(detail_url, pages: int):
     """Collect image URLs from pages 2..pages of an mm820-style chapter.

     Page 1 is assumed to have been scraped already by the caller; this
     walks the remaining ?page=N URLs.
     """
     selector = pc_rules_dict.get('mm820').get('comic_pages')
     images_url = []
     for page_no in range(2, pages + 1):
         doc = pyquery.PyQuery(get_response(detail_url + f'?page={page_no}').text)
         for node in doc(selector):
             img = pyquery.PyQuery(node)('img')
             # Lazy-loaded images carry data-original; fall back to src.
             images_url.append(img.attr('data-original') or img.attr('src'))
     return images_url
Beispiel #11
0
 def parse_images_url(url):
     """Fetch one chapter page, append its image URLs to ``images_url``,
     and return the parsed PyQuery document.

     NOTE(review): ``images_url`` is not defined in this function — it
     appends to a name from an enclosing/module scope; confirm it exists
     at call time. The PyQuery document (not the URL list) is returned,
     presumably so the caller can inspect pagination — verify against
     callers.
     """
     # These hosts are fetched with a mobile User-Agent — presumably the
     # desktop markup differs; confirm against the sites.
     if Comic.current_host_key in ('nonomh', 'qinqinmh', 'wzdhm'):
         header = {
             'User-Agent': random.choice(settings.ua['android']),
         }
         response = get_response(url, header=header)
     else:
         response = get_response(url)
     pq = pyquery.PyQuery(response.text)
     divs = pq(
         settings.pc_rules_dict.get(
             Comic.current_host_key).get('comic_pages'))
     for div in divs:
         if Comic.current_host_key in ('nonomh', 'qinqinmh', 'wzdhm'):
             # Mobile markup: the selected node itself carries data-original.
             img_src = pyquery.PyQuery(div).attr('data-original')
         else:
             # Lazy-loaded images prefer data-original, falling back to src.
             img_src = pyquery.PyQuery(div)('img').attr('data-original')
             if not img_src:
                 img_src = pyquery.PyQuery(div)('img').attr('src')
         images_url.append(img_src)
     return pq
Beispiel #12
0
    def get_detail_dicts(url, host_url, host_key) -> list:
        """Build Comic.detail_dicts (chapter title/href/comic title) for one comic.

        Args:
            url: the comic's detail-page URL.
            host_url: site root used to absolutize relative chapter links.
            host_key: key into Comic.rules_dict selecting the CSS rules.

        Returns:
            The accumulated Comic.detail_dicts list.

        Raises:
            KeyError: if no rule set exists for host_key.
        """
        response = get_response(url)
        pq = pyquery.PyQuery(response.text)
        Comic.current_host_key = host_key
        rule = Comic.rules_dict.get(host_key, '')
        if not rule: raise KeyError('该网站还没/有适配')

        # happymh has a dedicated parser.
        if Comic.current_host_key == 'happymh':
            comic_title = pq(rule.get('comic_title')).text()
            return Comic._happymh(response, comic_title)

        def detail_one_page(detail_url):
            # Parse one listing page and append its chapters to
            # Comic.detail_dicts; early-returns only for a single-chapter
            # 18comic comic.
            response = get_response(detail_url)
            pq = pyquery.PyQuery(response.text)
            lis = pq(rule.get('detail_lis'))
            comic_title = pq(rule.get('comic_title')).text()
            if Comic.current_host_key == '18comic':
                if not lis.length:
                    # No chapter list: single-chapter comic, link straight
                    # to the reader page.
                    detail_dict = {
                        'a_title': '共一话',
                        'a_href': host_url + pq('div.read-block a:first-child').attr('href').lstrip('/'),
                        'comic_title': comic_title,
                    }
                    Comic.detail_dicts.append(detail_dict)
                    return Comic.detail_dicts
                else:
                    print(f'该漫画共{len(lis)}章节')
            for li in lis:
                a_title = pyquery.PyQuery(li)('a').text()
                a_href = pyquery.PyQuery(li)('a').attr('href')
                for ch in r'\/:| <.・>?*"':
                    a_title = a_title.replace(ch, '・')  # drop filesystem-unsafe characters
                    comic_title = comic_title.replace(ch, '・')  # drop filesystem-unsafe characters
                if Comic.current_host_key == 'dongmanmanhua':
                    a_title = a_title.split('・')[0]
                detail_dict = {
                    'a_title': a_title,
                    'a_href': host_url + a_href.lstrip('/') if host_key not in a_href else a_href,
                    'comic_title': comic_title,
                }
                Comic.detail_dicts.append(detail_dict)

        detail_one_page(url)
        # Special cases: pyquery does not seem to support nth-child(n+3)-style
        # filters, so trim/iterate here instead.
        if Comic.current_host_key == 'hmba':
            Comic.detail_dicts = Comic.detail_dicts[9:]
        if Comic.current_host_key == 'dongmanmanhua':
            total_pages = len(pq('.paginate a'))
            for i in range(2, total_pages + 1):
                detail_one_page(url + f'&page={i}')
            Comic.detail_dicts.reverse()
        return Comic.detail_dicts
Beispiel #13
0
 def _ac_qq(detail_url):
     """Return the image URLs of an ac.qq.com chapter.

     The DATA payload and its nonce are embedded in the page and decoded
     by js/_ac_qq.js; the result carries a 'picture' list of {'url': ...}.
     """
     # If a cookie is required, configure it like:
     # headers = {
     #     'cookie': ...
     # }
     response = get_response(detail_url)
     # Raw strings: the originals relied on invalid escape sequences
     # (\[, \s) that only worked via the deprecated literal fallback.
     N = re.findall(r'window\["n.*?e"\]\s=\s(.*?);', response.text)[1]
     T = re.findall(r'var DATA.*?\'(.*?)\'', response.text)[0]
     # Context manager fixes the leaked file handle.
     with open('js/_ac_qq.js', encoding='gbk') as f:
         ctx = execjs.get().compile(f.read(), cwd='js/node_modules')
     data = ctx.call('getArr', T, N)
     images_url = [picture['url'] for picture in data['picture']]
     return images_url
Beispiel #14
0
 def get_images_url(detail_url):
     """Return the image URLs of a pufei8/taduo chapter.

     The page stores an eval-packed string in ``packed="...";`` which
     js/_pufei8.js unpacks into image paths (first entry discarded).
     Relative paths are prefixed with a per-host image server.
     """
     response = get_response(detail_url)
     packed = re.findall('packed="(.*)";', response.text)[0]
     # Context manager fixes the leaked file handle.
     with open('js/_pufei8.js') as f:
         ctx = execjs.get().compile(f.read())
     img_urls = ctx.eval(f'getArr("{packed}")')[1:]
     if 'http' in img_urls[0]:
         # Already absolute URLs — return as-is.
         return img_urls
     if 'taduo' in detail_url:
         img_servers = ['http://mh.jiduo.cc/']
     else:
         img_servers = ['http://res.img.jituoli.com/', 'http://res.img.fffmanhua.com/']
     images_url = [random.choice(img_servers) + i for i in img_urls]
     return images_url
Beispiel #15
0
 def _momomh(detail_url):
     """Return the image URLs of a momomh chapter.

     The page embeds a JS object ``loadConf = {...}`` with bare keys;
     the keys are quoted so eval() can read it, then js/_momomh.js
     decrypts the image list.
     """
     header = {
         'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1 Edg/85.0.4183.83',
     }
     response = get_response(detail_url, header=header)
     load_conf = re.findall('loadConf    =  ({.*?})', response.text, re.S)[0].strip('\n')
     for key in ('i:', 'c:', 'k:', 'd:', 'l:', 'f:'):
         # Quote the bare JS key (dropped the original's dead walrus `l :=`).
         load_conf = load_conf.replace(key, f'"{key[0]}":')
     # Context manager fixes the leaked file handle.
     with open('js/_momomh.js') as f:
         ctx = execjs.get().compile(f.read(), cwd='js/node_modules')
     data = ctx.call('getArr', eval(load_conf))
     # Bug fix: url.strip('_w_720') removed any of the characters _ w 7 2 0
     # from BOTH ends of the URL (mangling URLs that merely end in them);
     # the intent is to drop a literal '_w_720' width suffix.
     suffix = '_w_720'
     image_url = [url[:-len(suffix)] if url.endswith(suffix) else url
                  for url in data]
     return image_url
Beispiel #16
0
 def get_one_page_img(detail_url):
     """Scrape one haimaoba page, appending its image URLs to ``images_url``.

     Returns the absolute URL of the next page, or None when the pager
     (.fanye1 a) is absent.
     """
     doc = pyquery.PyQuery(get_response(detail_url).text)
     for node in doc(pc_rules_dict.get('haimaoba').get('comic_pages')):
         images_url.append(pyquery.PyQuery(node)('img').attr('src'))
     pager = doc('.fanye1 a')
     if not pager:
         return None
     return 'http://m.haimaoba.com' + pyquery.PyQuery(pager).attr('href')
Beispiel #17
0
    def get_detail_dicts(url, host_url, host_key) -> list:
        """Build Comic.detail_dicts (chapter name/url/comic title) for one comic.

        Args:
            url: the comic's detail-page URL.
            host_url: site root used to absolutize relative chapter links.
            host_key: key into settings.pc_rules_dict selecting the CSS rules.

        Returns:
            The accumulated Comic.detail_dicts list, ordered per-host.

        Raises:
            KeyError: if no rule set exists for host_key.
        """
        # Bug fix: ('momomh') is just a parenthesized string, so
        # `host_key in ('momomh')` was a SUBSTRING test ('mom' matched);
        # a 1-tuple makes it an exact membership test.
        if host_key in ('momomh',):
            url = url.replace('www', 'm', 1)
            print(f'----------{url}-------------')
        response = get_response(url)
        pq = pyquery.PyQuery(response.text)
        Comic.current_host_key = host_key
        rule = settings.pc_rules_dict.get(host_key, '')
        # Typo fix in the error message: 改 -> 该.
        if not rule: raise KeyError(f'{host_url}---->请先在setting文件配置该网站')

        if Comic.current_host_key == 'kanman':
            from kanman_com import Kanman
            return Kanman._kanman(url)

        if Comic.current_host_key == 'happymh':
            # Chapter list is embedded as JS; normalize booleans to ints
            # before eval'ing it into Python objects.
            chapter_list = re.findall('"chapterList":(.*),', response.text)[0]
            chapter_list = eval(
                chapter_list.replace('false', '0').replace('true', '1'))
            for d in chapter_list:
                detail_dict = {
                    'chapter': Comic.exclude_character(d['chapterName']),
                    'chapter_url': url + '/' + d['id'],
                    'comic_title': pq('.mg-title').text(),
                }
                Comic.detail_dicts.append(detail_dict)
            return Comic.detail_dicts[::-1]

        def detail_one_page(detail_url):
            # Parse one listing page and append its chapters to
            # Comic.detail_dicts; early-returns only for a single-chapter
            # 18comic2 comic.
            response = get_response(detail_url)
            if Comic.current_host_key in ('momomh', ):
                response = get_response(detail_url,
                                        header={
                                            'User-Agent':
                                            random.choice(
                                                settings.ua['android'])
                                        })
            if Comic.current_host_key in ('haimaoba', 'pufei8', 'taduo'):
                response = get_response(detail_url, encoding='gbk')
            pq = pyquery.PyQuery(response.text)
            lis = pq(rule.get('detail_lis'))
            comic_title = pq(rule.get('comic_title')).text()
            # Tag finished series with a "(完结)" suffix.
            if '最终话' in Comic.exclude_character(comic_title):
                comic_title = Comic.exclude_character(comic_title + '(完结)')
            if Comic.current_host_key == '18comic2':
                comic_title = comic_title[(len(comic_title) // 2) + 1:]
            if Comic.current_host_key == '18comic2':
                if not lis.length:
                    # No chapter list: single-chapter comic, link straight
                    # to the reader page.
                    detail_dict = {
                        'chapter':
                        '共一话',
                        'chapter_url':
                        host_url + pq('div.read-block a:first-child').attr(
                            'href').lstrip('/'),
                        'comic_title':
                        Comic.exclude_character(comic_title) + '(完结)',
                    }
                    Comic.detail_dicts.append(detail_dict)
                    return Comic.detail_dicts
            for li in lis:
                chapter = pyquery.PyQuery(li)('a').text()
                # Rebuild the chapter name as "001 <title>".
                if Comic.current_host_key in Comic.is_serial_number:
                    if host_key in ('qq', 'manhuaniu', '36mh', 'kanman'):
                        chapter = str(lis.index(li) + 1).rjust(
                            3, '0') + " " + chapter
                    else:
                        # Other hosts list newest-first: number from the end.
                        chapter = str(lis[::-1].index(li) + 1).rjust(
                            3, '0') + " " + chapter
                chapter_url = pyquery.PyQuery(li)('a').attr('href')
                if Comic.current_host_key == 'dongmanmanhua':
                    chapter = chapter.split('・')[0]
                detail_dict = {
                    'chapter':
                    Comic.exclude_character(chapter),
                    'chapter_url':
                    host_url + chapter_url.lstrip('/')
                    if host_key not in chapter_url else chapter_url,
                    'comic_title':
                    Comic.exclude_character(comic_title),
                }
                Comic.detail_dicts.append(detail_dict)

        detail_one_page(url)
        # Special cases: pyquery does not seem to support nth-child(n+3)-style
        # filters, so trim/iterate here instead.
        if Comic.current_host_key == 'hmba':
            Comic.detail_dicts = Comic.detail_dicts[9:]
        if Comic.current_host_key == 'dongmanmanhua':
            total_pages = len(pq('.paginate a'))
            for i in range(2, total_pages + 1):
                detail_one_page(url + f'&page={i}')
            Comic.detail_dicts.reverse()
        if Comic.current_host_key == '18comic2':
            # First entry may carry the comic title inside the chapter name;
            # normalize it, and strip the "最新" (latest) tag on the last entry.
            try:
                if len(Comic.detail_dicts[0]['chapter']) > len(
                        Comic.detail_dicts[1]['chapter']):
                    Comic.detail_dicts[0]['chapter'] = Comic.detail_dicts[0][
                        'chapter'].replace(
                            Comic.detail_dicts[0]['comic_title'], '1 ')
                    Comic.detail_dicts[-1]['chapter'] = Comic.detail_dicts[-1][
                        'chapter'].replace('最新', '')
            except IndexError:
                print('短篇漫画')
            return Comic.detail_dicts
        # These hosts list newest-first; return oldest-first.
        if Comic.current_host_key in ('733', 'pufei8', 'taduo', 'cocomanhua'):
            return Comic.detail_dicts[::-1]
        return Comic.detail_dicts