def get_video(url, session):
    while url.strip().endswith('/'):
        url = url[:-1]
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    params = re.findall('VodParameter *= *[\'"]([^\'"]+)[\'"]', html)[0]
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time() * 1000))
    url_xml = 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
def get_imgs(url, title=None, customWidget=None):
    username = re.findall('/member/([^/]+)', url)[0]
    url = 'https://worldcosplay.net/member/{}'.format(username)
    html = downloader.read_html(url)
    soup = Soup(html)
    userid = re.find('"member_id" *: *([0-9]+)', html)
    if userid is None:
        raise Exception('no userid')
    print('userid:', userid)
    p = 1
    imgs = []
    while True:
        url = 'http://worldcosplay.net/en/api/member/photos?member_id={}&page={}&limit=100000&rows=16&p3_photo_list=1'.format(userid, p)
        html = downloader.read_html(url)
        j = json.loads(html)
        list = j['list']
        print(len(list))
        if not list:
            break
        for img in list:
            photo = img['photo']
            id = photo['id']
            url_img = photo['sq300_url']
            sizes = re.findall('/max-([0-9]+)/', url_img)
            if sizes:
                size = sizes[0]
            else:
                size = 3000
            url_img = url_img.replace('-350x600', '-{}'.format(size))
            img = Image(url_img, id)
            imgs.append(img)
        p += 1
        if customWidget is not None:
            if not customWidget.alive:
                break
            customWidget.exec_queue.put(
                (customWidget, u"customWidget.setTitle(u'{} {} - {}')".format(
                    tr_(u'읽는 중...'), title, len(imgs))))
    return imgs
def get_id(url):
    url = url.lower()
    if '/prof-video-click/upload/' in url:
        return url.split('/prof-video-click/upload/')[1].split('/')[1]
    return re.findall(
        '[0-9]+',
        url.split('xvideos.')[1].split('/')[1].split('?')[0].split('#')[0])[0]
def get_pages(url, cw=None):
    print_ = get_print(cw)
    url = get_main(url).replace('comic.naver.', 'm.comic.naver.')
    id = get_id(url)
    print('id:', id)
    print(url)
    html = downloader.read_html(url)
    soup = Soup(html)
    try:
        info = soup.find('div', class_='area_info')
        artist = info.find('span', class_='author').text.strip()
    except Exception as e:
        print(e)
        try:
            title = ('\n').join(
                soup.find('div', class_='title').text.strip().split('\n')[:-1]).strip()
        except:
            title = 'artist not found'
        raise Exception(title)
    print('artist:', artist)
    title = soup.find('meta', {'property': 'og:title'}).attrs['content']
    pages = []
    nos = set()
    for p in range(1, 100):
        if p == 1:
            url_page = url
        else:
            url_page = set_page(url, p)
        html = downloader.read_html(url_page)
        print('read page:', url_page)
        soup = Soup(html)
        view = soup.findAll('ul', class_='section_episode_list')[-1]
        for lst in view.findAll('li'):
            url_page = urljoin(url, lst.find('a').attrs['href'])
            if 'detail.nhn' not in url_page.lower():
                continue
            print_('url_page: {}'.format(url_page))
            text = lst.find('strong', class_='title').find('span', class_='name').text.strip()
            no = int(re.findall('[?&]no=([0-9]+)', url_page)[0])
            if no in nos:
                print('duplicate no: {}'.format(no))
                continue
            nos.add(no)
            text = '{:04} - {}'.format(no, text)
            page = Page(url_page, text, p)
            pages.append(page)
        btn_next = soup.find('a', class_='btn_next')
        if btn_next is None or btn_next.attrs['href'] == '#':
            print('end of page')
            break
    info = Info(id, title, artist)
    return (info, pages)
def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            oid = _get_page_id(html)
            if not oid:
                raise Exception('no page_id')
            uids = re.findall(r'uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.find(r"CONFIG\['onick'\]='(.+?)'", html) or soup.find(
                'div', class_=lambda c: c and c.startswith('ProfileHeader_name')).text.strip()
            if not name:
                raise Exception('no name')
            break
        except errors.LoginRequired as e:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name
def get_video(url, session, cw):
    print_ = get_print(cw)
    html = downloader.read_html(url, session=session)
    if "document.location.href='https://login." in html:
        raise errors.LoginRequired()
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    print_('url_thumb: {}'.format(url_thumb))
    params = re.find('VodParameter *= *[\'"]([^\'"]+)[\'"]', html, err='No VodParameter')
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time()*1000))
    url_xml = 'http://stbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    if '<flag>PARTIAL_ADULT</flag>' in html:
        raise errors.LoginRequired()
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
def __init__(self, illust, url, ugoira_data=None, format_name=None):
    self.illust = illust
    self.id = illust.id
    self.type = illust.type
    self.title = illust.title
    self.artist = illust.user.name
    self.url = LazyUrl('https://app-api.pixiv.net/', lambda _: url, self)
    ps = re.findall('_p([0-9]+)', url)
    p = ps[-1] if ps else 0
    self.p = p
    self.ext = os.path.splitext(url.split('?')[0].split('#')[0])[1]
    if self.type == 'ugoira':
        self.ugoira_data = ugoira_data
    if format_name:
        name = format_name.replace('id', '###id*').replace(
            'page', '###page*').replace('artist', '###artist*').replace(
            'title', '###title*')
        name = name.replace('###id*', str(self.id)).replace(
            '###page*', str(self.p)).replace('###artist*', self.artist).replace(
            '###title*', self.title)
        self.filename = clean_title(
            name.strip(), allow_dot=True, n=-len(self.ext)) + self.ext
    else:
        self.filename = os.path.basename(url.split('?')[0].split('#')[0])
    self.utime = get_time(illust)
def id_(self):
    ids = re.findall('.com/([^/]+)', self.url)
    if ids:
        id = ids[0]
    else:
        id = self.url
    return id
def __init__(self, title, url):
    if title.startswith('NEW'):
        title = title.replace('NEW', '', 1).strip()
    title = fix_title_page(title)
    self.title = clean_title(title)
    self.url = url
    self.id = int(re.findall('wr_id=([0-9]+)', url)[0])
def fix_url(cls, url):
    url = url.replace('weibo.cn', 'weibo.com').split('?')[0]
    if 'weibo.com/p/' in url:
        id = re.findall('weibo.com/p/([^/]+)', url)[0]
        url = 'https://weibo.com/p/{}'.format(id)
    elif 'weibo.com/u/' in url:
        id = re.findall('weibo.com/u/([^/]+)', url)[0]
        url = 'https://weibo.com/u/{}'.format(id)
    elif 'weibo.com/' in url:
        id = re.findall('weibo.com/([^/]+)', url)[0]
        url = 'https://weibo.com/{}'.format(id)
    else:
        id = url
        url = 'https://weibo.com/u/{}'.format(id)
    url = fix_protocol(url)
    return url
def __init__(self, url, cw=None):
    self.cw = cw
    self.url = re.findall(r'archive.[^/]+/(?:cdx/search/cdx\?url=|(?:web/)?(?:[^/]+/))(.+)', url.lower())[0].strip('/')
    self.base_url = self.url.split('&')[0].strip('/')
    self.md5 = md5(self.url.encode('utf8')).hexdigest()[:8]
    self.mode = self.__get_mode()
    self.title = self.__get_title()
def get_imgs(page, cw=None):
    print_ = get_print(cw)
    html = downloader.read_html(page.url)
    soup = Soup(html)
    type_ = re.find('''webtoonType *: *['"](.+?)['"]''', html)
    print_('type: {}'.format(type_))

    imgs = []
    if type_ == 'DEFAULT': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=715772
        view = soup.find('div', class_='toon_view_lst')
        for img in view.findAll('img'):
            img = img.attrs.get('data-src')
            if not img:
                continue
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'CUTTOON': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=752803
        view = soup.find('div', class_='swiper-wrapper')
        for div in view.findAll('div', class_='swiper-slide'):
            if div.parent != view:
                continue
            if div.find('div', class_='cut_viewer_last'):
                print('cut_viewer_last')
                continue
            if div.find('div', class_='cut_viewer_recomm'):
                print('cut_viewer_recomm')
                continue
            img = div.find('img')
            img = img.attrs['data-src']
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'EFFECTTOON': #2313; https://m.comic.naver.com/webtoon/detail.nhn?titleId=670144
        img_base = re.find('''imageUrl *: *['"](.+?)['"]''', html) + '/'
        print('img_base:', img_base)
        url_api = re.find('''documentUrl *: *['"](.+?)['"]''', html)
        data_raw = downloader.read_html(url_api, page.url)
        data = json.loads(data_raw)
        for img in data['assets']['stillcut'].values(): # ordered in python3.7+
            img = urljoin(img_base, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    else:
        _imgs = re.findall('sImageUrl *: *[\'"](.+?)[\'"]', html)
        if not _imgs:
            raise Exception('no imgs')
        for img in _imgs:
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)

    return imgs
def following(self, p, r18=False):
    url = 'https://www.pixiv.net/bookmark_new_illust_r18.php' if r18 else 'https://www.pixiv.net/bookmark_new_illust.php'
    if p > 1:
        url += '?p={}'.format(p)
    html = downloader.read_html(url, session=self.session)
    ids = []
    ids_set = set()
    for id_ in re.findall('([0-9]+)_p0_master1200', html):
        if id_ in ids_set:
            continue
        ids_set.add(id_)
        ids.append(id_)
    return ids
def get_page(url):
    qs = query_url(url)
    page = qs.get('p')
    if page:
        page = int(page[0])
    else:
        page = re.findall('_p([0-9]+)', url)
        if page:
            page = int(page[0])
        else:
            page = None
    if page == 1:
        page = None
    return page
def setPage(url, page):
    # Always use HTTPS
    url = url.replace('http://', 'https://')

    # Main page
    if re.findall(r'https://[\w]*[.]?donmai.us/?$', url):
        url = 'https://{}donmai.us/posts?page=1'.format('danbooru.' if 'danbooru.' in url else '')

    # Change the page
    if 'page=' in url:
        url = re.sub('page=[0-9]*', 'page={}'.format(page), url)
    else:
        url += '&page={}'.format(page)

    return url
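# A minimal, self-contained sketch of the page-substitution behavior used by
# setPage() above: replace an existing `page=` parameter, otherwise append one.
# The example URLs are hypothetical; only the standard-library `re` is assumed.
import re

def _set_page_sketch(url, page):
    if 'page=' in url:
        return re.sub('page=[0-9]*', 'page={}'.format(page), url)
    return url + '&page={}'.format(page)

assert _set_page_sketch('https://danbooru.donmai.us/posts?page=1', 3) == \
    'https://danbooru.donmai.us/posts?page=3'
assert _set_page_sketch('https://danbooru.donmai.us/posts?tags=cat', 2) == \
    'https://danbooru.donmai.us/posts?tags=cat&page=2'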
def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            m = _get_page_id(html)
            if not m:
                raise Exception('no page_id')
            oid = m.groups()[0]
            uids = re.findall('uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.findall("CONFIG\\['onick'\\]='(.+?)'", html)[0]
            break
        except errors.LoginRequired as e:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name
def __init__(self, url):
    if 'fbid=' in url:
        id = int(re.findall('fbid=([0-9]+)', url)[0])
    elif 'photos/' in url:
        id = int(url.split('photos/')[1].split('/')[1])
    else:
        id = int(url)
    self.id = id

    def f(_):
        img = get_img(url)
        ext = os.path.splitext(img.split('?')[0])[1]
        self.filename = u'{}{}'.format(id, ext)
        return img
    self.url = LazyUrl(url, f, self)
def __init__(self, title, update, url, session, single):
    if single:
        self.p = None
        self.title = title
    else:
        self.p = int(re.findall('/([0-9]+)', url)[-1])
        title = (u'[{:04}] {}').format(self.p, title)
        title = clean_title(title, n=-4)
        self.title = title
    self.filename = (u'{}.txt').format(self.title)

    def f(url):
        text = get_text(url, self.title, update, session)
        f = BytesIO()
        f.write(text.encode('utf8'))
        f.seek(0)
        return f
    self.url = LazyUrl(url, f, self)
def get(self, url):
    ''' get '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
          re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id')
    print_('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    url_test = url.replace('pornhubpremium.com', 'pornhub.com')
    try:
        html = downloader.read_html(url_test, session=session)
        soup = Soup(html)
        if soup.find('div', id='lockedPlayer'):
            print_('Locked player')
            raise Exception('Locked player')
        url = url_test
    except: #3511
        url = url.replace('pornhub.com', 'pornhubpremium.com')
        html = downloader.read_html(url, session=session)
        soup = Soup(html)

    soup = fix_soup(soup, url, session, cw)
    html = str(soup)

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        ydl = ytdl.YoutubeDL(cw=self.cw)
        info = ydl.extract_info(url)
        url_thumb = info['thumbnail']
        videos = []
        for f in info['formats']:
            video = {}
            video['height'] = f['height']
            video['quality'] = f['height'] or 0
            video['protocol'] = f['protocol']
            video['videoUrl'] = f['url']
            if f['protocol'] == 'm3u8':
                video['quality'] -= 1
            print_('[{}p] {} {}'.format(video['height'], video['protocol'], video['videoUrl']))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])
        res = get_resolution()
        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {} {}'.format(video['height'], video['protocol'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb
    return self._url
def album(self):
    if 'album_id=' in self.url:
        album = re.findall('album_id=([0-9]+)', self.url)[0]
    else:
        album = None
    return album
def get_imgs(url, title=None, customWidget=None, d=None, types=['img', 'gif', 'video'], session=None):
    if False:#
        raise NotImplementedError('Not Implemented')
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put((customWidget, u"customWidget.setTitle(u'{} {}')".format(tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1)#
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})
        if not articles:
            break
        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags: # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = re.findall('show/([0-9]+)', url_img)[0]
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=customWidget, d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        if customWidget and not customWidget.alive:
            break
        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type), pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break
        if customWidget is not None:
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')
    if not imgs:
        raise Exception('no images')

    return imgs
def find_url(html, url):
    href = re.findall('[\'"]([^\'"]+page.php[^\'"]+manga_detail[^\'"]+)[\'"]', html)[0]
    href = html_unescape(href)
    return urljoin(url, href)
def get_imgs_page_legacy(page, session, cw=None, depth=0):
    if cw is not None and not cw.alive:
        return
    print_ = get_print(cw)

    try:
        html = read_html(page.url, session)
    except Exception as e:
        print_('get_imgs_page_legacy error: {}'.format(e))
        if e.args and e.args[0] == 502:
            return []
        raise

    if isProtected(html):
        data = get_soup(page.url, cw=cw, session=session)
        page.url = data['url']
        html = data['html']
    soup = Soup(html, 'html5lib') # 1653

    # skip empty pages
    if not html:
        print_(u'empty page: {}'.format(page.title))
        return []

    # skip invalid pages
    err = soup.find('span', class_='cf-error-code')
    if err:
        print_(u'cf-error-code: {} ({})'.format(err.text.strip(), page.title))
        if depth > 0:
            return []
        else:
            return get_imgs_page_legacy(page, session, cw, depth + 1)

    #page.title = get_title_page(soup)
    matches = re.findall('var img_list *= *(.+?]);', html.replace('\n', ''))
    matches1 = re.findall('var img_list1 *= *(.+?]);', html.replace('\n', ''))
    img_list = json.loads(matches[0]) if matches else []
    img_list1 = json.loads(matches1[0]) if matches1 else []

    # 1780
    img_list = [img for img in img_list if img]
    img_list1 = [img for img in img_list1 if img]

    # 1589
    '''
    if not img_list and not img_list1:
        print_((u'no imgs; retry... {}').format(page.title))
        raise Exception('No images')
    '''

    for script in soup.findAll('script'):
        script = script.text
        if 'var img_list =' in script:
            break
    else:
        raise Exception('No script')

    seed = int(re.find('view_cnt *= *([0-9]+)', script))
    chapter = int(re.find('var +chapter *= *([0-9]+)', script))
    try:
        cdn_domains = cut_pair(re.find('var +cdn_domains *= *(.+)', script), '[]')
        cdn_domains = json.loads(cdn_domains)
    except Exception as e:
        print(e)
        cdn_domains = []

    n = max(len(img_list), len(img_list1))
    img_list += [''] * (n - len(img_list))
    img_list1 += [''] * (n - len(img_list1))

    print_(u'{} chapter:{} seed:{} domains:{}'.format(page.title, chapter, seed, len(cdn_domains)))
    if seed != 0:
        return 'seed'

    imgs = []
    for p, (img, img1) in enumerate(zip(img_list, img_list1)):
        # fix img url
        img = fix_img_url(img, cdn_domains, chapter, p)
        img1 = fix_img_url(img1, cdn_domains, chapter, p)

        img = urljoin(page.url, img) if img else ''
        img1 = urljoin(page.url, img1) if img1 else ''

        # most likely googledrive
        if img.strip('/').count('/') == 2: #1425
            continue
        img = Image(img, page, p, img1)
        imgs.append(img)

    return imgs
def get(self, url):
    ''' get '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
          re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id')
    print_('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    url_test = url.replace('pornhubpremium.com', 'pornhub.com')
    try:
        html = downloader.read_html(url_test, session=session)
        soup = Soup(html)
        if soup.find('div', id='lockedPlayer'):
            print_('Locked player')
            raise Exception('Locked player')
        url = url_test
    except: #3511
        url = url.replace('pornhub.com', 'pornhubpremium.com')
        html = downloader.read_html(url, session=session)
        soup = Soup(html)

    soup = fix_soup(soup, url, session, cw)
    html = str(soup)

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        video_urls = []
        video_urls_set = set()

        def int_or_none(s):
            try:
                return int(s)
            except:
                return None

        def url_or_none(url):
            if not url or not isinstance(url, str):
                return None
            url = url.strip()
            return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None

        flashvars = json.loads(re.find(r'var\s+flashvars_\d+\s*=\s*({.+?});', html, err='no flashvars'))

        url_thumb = flashvars.get('image_url')

        media_definitions = flashvars.get('mediaDefinitions')
        if isinstance(media_definitions, list):
            for definition in media_definitions:
                if not isinstance(definition, dict):
                    continue
                video_url = definition.get('videoUrl')
                if not video_url or not isinstance(video_url, str):
                    continue
                if video_url in video_urls_set:
                    continue
                video_urls_set.add(video_url)
                video_urls.append(
                    (video_url, int_or_none(definition.get('quality'))))

        def extract_js_vars(webpage, pattern, default=object()):
            assignments = re.find(pattern, webpage, default=default)
            if not assignments:
                return {}

            assignments = assignments.split(';')

            js_vars = {}

            def remove_quotes(s):
                if s is None or len(s) < 2:
                    return s
                for quote in ('"', "'", ):
                    if s[0] == quote and s[-1] == quote:
                        return s[1:-1]
                return s

            def parse_js_value(inp):
                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                if '+' in inp:
                    inps = inp.split('+')
                    return functools.reduce(
                        operator.concat, map(parse_js_value, inps))
                inp = inp.strip()
                if inp in js_vars:
                    return js_vars[inp]
                return remove_quotes(inp)

            for assn in assignments:
                assn = assn.strip()
                if not assn:
                    continue
                assn = re.sub(r'var\s+', '', assn)
                vname, value = assn.split('=', 1)
                js_vars[vname] = parse_js_value(value)
            return js_vars

        def add_video_url(video_url):
            v_url = url_or_none(video_url)
            if not v_url:
                return
            if v_url in video_urls_set:
                return
            video_urls.append((v_url, None))
            video_urls_set.add(v_url)

        def parse_quality_items(quality_items):
            q_items = json.loads(quality_items)
            if not isinstance(q_items, list):
                return
            for item in q_items:
                if isinstance(item, dict):
                    add_video_url(item.get('url'))

        if not video_urls:
            print_('# extract video_urls 2')
            FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
            js_vars = extract_js_vars(
                html, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
                default=None)
            if js_vars:
                for key, format_url in js_vars.items():
                    if key.startswith(FORMAT_PREFIXES[-1]):
                        parse_quality_items(format_url)
                    elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
                        add_video_url(format_url)
            if not video_urls and re.search(
                    r'<[^>]+\bid=["\']lockedPlayer', html):
                raise Exception('Video is locked')

##        if not video_urls:
##            print_('# extract video_urls 3')
##            js_vars = extract_js_vars(
##                dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
##            add_video_url(js_vars['mediastring'])

        for mobj in re.finditer(
                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                html):
            video_url = mobj.group('url')
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

        video_urls_ = video_urls
        video_urls = []
        for video_url, height in video_urls_:
            if '/video/get_media' in video_url:
                print_(video_url)
                medias = downloader.read_json(video_url, session=session)
                if isinstance(medias, list):
                    for media in medias:
                        if not isinstance(media, dict):
                            continue
                        video_url = url_or_none(media.get('videoUrl'))
                        if not video_url:
                            continue
                        height = int_or_none(media.get('quality'))
                        video_urls.append((video_url, height))
                continue
            video_urls.append((video_url, height))

        videos = []
        for video_url, height in video_urls:
            video = {}
            video['height'] = height or int_or_none(re.find(r'(?P<height>\d+)[pP]?_\d+[kK]', video_url))
            video['quality'] = video['height'] or 0
            video['videoUrl'] = video_url
            ext = get_ext(video_url)
            video['ext'] = ext
            if ext.lower() == '.m3u8':
                video['quality'] -= 1
            print_('[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])
        res = get_resolution()
        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb
    return self._url
def get(self, url):
    ''' get '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
          re.find(r'/embed/(\w+)', url, re.IGNORECASE)
    print('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    soup = fix_soup(soup, url, session, cw)
    html = str(soup)

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        j = decode(html, cw)

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        url_thumb = j['image_url']
        videos = []
        for video in j['mediaDefinitions']:
            url_ = video.get('videoUrl').strip()
            ext = get_ext(url_)
            if ext.lower() not in ['.mp4', '.m3u8']:
                print('not mp4: {}'.format(ext))
                continue
            quality = video.get('quality', 0)
            if isinstance(quality, list):
                quality = quality[0]
            video['quality'] = int(quality)
            print_('[{}p] {}'.format(quality, url_))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])
        res = get_resolution()
        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {}'.format(video['quality'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb
    return self._url
def get_videos(url, cw=None, depth=0):
    print_ = get_print(cw)

    if utils.ui_setting:
        res_text = compatstr(utils.ui_setting.youtubeCombo_res.currentText())
        res = {
            '720p': 720,
            '1080p': 1080,
            '2K': 1440,
            '4K': 2160,
            '8K': 4320
        }[res_text]
    else:
        res = 720

    mobj = re.match(_VALID_URL, url)
    video_id = mobj.group('id')
    anime_id = mobj.group('anime_id')
    print(video_id, anime_id)
    print_ = get_print(cw)
    html = downloader.read_html(url, methods={'requests'})
    soup = Soup(html)

    title = soup.find('h1').attrs['title'].strip()
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']

    p = get_page(url)
    if p is None:
        p = 1
    print('page:', p)
    if p > 1:
        pages = get_pages(html)
        cid = pages[p - 1]['cid']
    else:
        cid = re.findall('\\bcid(?:["\\\']:|=)(\\d+)', html)[0]
    print_('cid: {}'.format(cid))

    headers = {'Referer': url}
    entries = []

    RENDITIONS = [
        'qn={}&quality={}&type='.format(qlt, qlt) for qlt in RESOLS.keys()
    ]  # + ['quality=2&type=mp4']

    for num, rendition in enumerate(RENDITIONS, start=1):
        print('####', num, rendition)
        payload = 'appkey=%s&cid=%s&otype=json&%s' % (_APP_KEY, cid, rendition)
        sign = hashlib.md5((payload + _BILIBILI_KEY).encode('utf-8')).hexdigest()
        url_json = 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign)
        s_json = downloader.read_html(url_json)
        print(s_json[:1000])
        video_info = json.loads(s_json)
        if not video_info:
            continue
        if 'durl' not in video_info:
            print('#### error', num)
            if num < len(RENDITIONS):
                continue
            msg = video_info.get('message')
            if msg:
                raise Exception(msg)
        quality = video_info['quality']
        resolution = get_resolution(quality)
        s = (u'resolution: {}').format(resolution)
        print_(s)

        # 2184
        if int(re.find('([0-9]+)p', resolution)) > res:
            print_('skip resolution')
            continue

        for idx, durl in enumerate(video_info['durl']):
            # 1343
            if idx == 0:
                size = downloader.get_size(durl['url'], referer=url)
                if size < 1024 * 1024 and depth == 0:
                    print_('size is too small')
                    return get_videos(url, cw, depth + 1)

            formats = [{
                'url': durl['url'],
                'filesize': int_or_none(durl['size'])
            }]
            for backup_url in durl.get('backup_url', []):
                formats.append({
                    'url': backup_url,
                    'preference': -2 if 'hd.mp4' in backup_url else -3
                })

            for a_format in formats:
                a_format.setdefault('http_headers', {}).update({'Referer': url})

            entries.append({
                'id': '%s_part%s' % (video_id, idx),
                'duration': float_or_none(durl.get('length'), 1000),
                'formats': formats
            })
        break

    videos = []
    for entry in entries:
        url_video = entry['formats'][0]['url']
        video = Video(url_video, url, video_id, len(videos))
        videos.append(video)

    info = {'title': clean_title(title), 'url_thumb': url_thumb}
    return (videos, info)
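# A minimal, self-contained sketch of the playurl request signing used in
# get_videos() above: the sign is the MD5 hex digest of the unsigned query
# string concatenated with the app secret. The key/cid values below are
# placeholders for illustration, not real credentials.
import hashlib

def _sign_payload_sketch(payload, app_secret):
    # payload is the unsigned query string, e.g. 'appkey=...&cid=...&otype=json&qn=80&...'
    return hashlib.md5((payload + app_secret).encode('utf-8')).hexdigest()

_payload = 'appkey=DUMMY_APP_KEY&cid=123456&otype=json&qn=80&quality=80&type='
_sign = _sign_payload_sketch(_payload, 'DUMMY_SECRET')
_url_json = 'http://interface.bilibili.com/v2/playurl?{}&sign={}'.format(_payload, _sign)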
def twitter():
    return '@' + re.findall('twitter.[^/]+/([^/*?]+)', self.url)[0]
def get_id(url):
    if '/watch/' in url:
        id = re.findall('/watch/([a-zA-Z0-9]+)', url)[0]
    else:
        id = url
    return id
def get_imgs(url, filter_, directory, session=Session(), cw=None):
    print_ = get_print(cw)
    if not os.path.exists(directory):
        os.makedirs(directory)

    urls_path = os.path.join(directory, '{}.urls'.format(filter_.md5))
    bitmap_path = os.path.join(directory, '{}.bitmap'.format(filter_.md5))
    count_path = os.path.join(directory, '{}.count'.format(filter_.md5))

    for path in [urls_path, bitmap_path, count_path]:
        if not os.path.exists(path):
            open(path, 'x').close()

    with open(count_path) as file:
        num_complete = (lambda x: int(x) if x else 0)(file.read())

    snapshots = WaybackMachineAPI(session, cw).snapshots(url)
    bitmap = Bitmap(cw=cw).load(len(snapshots), bitmap_path) if num_complete else Bitmap(len(snapshots), cw=cw)

    base_url = 'https://web.archive.org/web/{}im_/{}'

    def get_imgs_snapshot(id_, snapshot):
        @sleep_and_retry
        @limits(1, 5)
        def get_soup():
            try:
                return downloader.read_soup(f'https://web.archive.org/web/{snapshot[0]}id_/{snapshot[1]}')
            except Exception as exception:
                print_(print_error(exception)[0])
                return None

        def get_imgs_soup(soup):
            if not soup:
                return []

            def default():
                return [base_url.format(snapshot[0], img['src']) for img in soup.find_all('img', src=True)]

            def twitter():
                return [base_url.format(snapshot[0], img['src']) for img in soup.find_all('img', src=True) if 'twimg.com/media/' in img['src']]

            return [
                default,
                twitter
            ][filter_.mode]()

        return id_, get_imgs_soup(get_soup())

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(get_imgs_snapshot, id_, snapshot) for id_, snapshot in enumerate(snapshots) if not bitmap.get(id_)]
        with open(urls_path, 'a') as urls_file:
            for future in concurrent.futures.as_completed(futures):
                id_, urls = future.result()
                urls_file.writelines([f'{url}\n' for url in urls])
                bitmap.update(id_, bitmap_path)
                num_complete += 1
                with open(count_path, 'w') as count_file:
                    count_file.write(str(num_complete))
                msg = f'{filter_.title} - {num_complete}'
                cw.setTitle(msg) if cw else print_(msg)

    with open(urls_path) as file:
        urls = set()
        for url in file.readlines():
            urls.update(re.findall(r'^\S+$', url))

    os.remove(urls_path)
    os.remove(bitmap_path)
    os.remove(count_path)

    return urls
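# The Bitmap class used in get_imgs() above is external to this snippet. A
# minimal sketch of the idea (one persistent "done" flag per snapshot index,
# so an interrupted run can resume without re-fetching completed snapshots)
# might look like the following; the on-disk format here is an assumption,
# not the project's actual Bitmap implementation.
class BitmapSketch:
    def __init__(self, size):
        self.bits = bytearray(size)

    def get(self, i):
        # True if snapshot i has already been processed.
        return bool(self.bits[i])

    def update(self, i, path):
        # Mark snapshot i as complete and persist the whole bitmap to disk.
        self.bits[i] = 1
        with open(path, 'wb') as f:
            f.write(bytes(self.bits))

    def load(self, size, path):
        # Restore flags from a previous run, padding if the file is short.
        with open(path, 'rb') as f:
            data = f.read()
        self.bits = bytearray(data[:size].ljust(size, b'\x00'))
        return self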