def get(self, _):
        """Resolve and cache the direct video URL for this tweet via ytdl.

        Retries up to ``self.try_n`` times, sleeping between attempts;
        re-raises the last error when every attempt fails. HLS (.m3u8)
        results are wrapped in a threaded M3u8_stream reader.
        """
        # Serve the memoized result when already resolved.
        if self._url_cache:
            return self._url_cache
        print_ = get_print(self.cw)
        attempt = 0
        while attempt < self.try_n:
            try:
                info = ytdl.YoutubeDL().extract_info(self._url)
                url = info['url']
                self.ext = get_ext(url)
                print_('get_video: {} {}'.format(url, self.ext))
                if self.ext.lower() == '.m3u8':
                    url = M3u8_stream(url,
                                      n_thread=self.n_thread,
                                      post_processing=True)
                self._url_cache = url
                return url
            except Exception as e:
                e_ = e
                msg = print_error(e)[(-1)]
                print_('\nTwitter video Error:\n{}'.format(msg))
                if attempt < self.try_n - 1:
                    sleep(10, self.cw)
            attempt += 1
        raise e_
    def f(html, browser=None):
        """Browser callback: parse one page snapshot, collect TikTok video
        ids, then scroll for more. Returns True when scraping should stop.

        Closure state from the enclosing scope: ``sd`` (captcha/empty-page
        bookkeeping), ``info`` (result dict), ``ids`` (all seen ids),
        ``cw``, ``print_``, ``max_pid``, ``PATTERN_VID``.
        """
        soup = Soup(html)
        # Show the browser window while a captcha is displayed; hide it
        # again once it is gone.
        if is_captcha(soup):
            print('captcha')
            browser.show()
            sd['shown'] = True
        elif sd['shown']:
            browser.hide()
            sd['shown'] = False
        # Profile header; tolerate layout changes by logging parse errors.
        try:
            info['uid'] = soup.find('h2', class_='share-title').text.strip()
            info['nickname'] = soup.find(
                'h1', class_='share-sub-title').text.strip()
        except Exception as e:
            print_(print_error(e)[0])
        c = 0  # ids newly discovered in this snapshot
        ids_now = set()  # all ids visible in this snapshot
        for div in soup.findAll('div', class_='video-feed-item'):
            a = div.find('a')
            if a is None:
                continue
            href = a['href']
            if not href:
                continue
            m = re.search(PATTERN_VID, href)
            if m is None:
                continue
            id_video = int(m.group('id'))
            ids_now.add(id_video)
            if id_video in ids:
                continue
            ids.add(id_video)
            info['items'].append({'id': id_video})
            c += 1

        print_('items: {}'.format(len(info['items'])))
        # Enough items collected: trim to the requested range and stop.
        if len(info['items']) >= max_pid:
            info['items'] = info['items'][:max_pid]
            return True

        # Trigger lazy loading of the next batch, then wait for it.
        browser.runJavaScript(
            'window.scrollTo(0, document.body.scrollHeight);')
        sleep(15, cw)

        # Progress = new ids appeared, or the visible window has not yet
        # scrolled down to the oldest known id; otherwise count as empty.
        if c or (ids_now and min(ids_now) > min(ids)):
            sd['count_empty'] = 0
        else:
            print_('empty')
            sd['count_empty'] += 1
        msg = '{}  {} (tiktok_{}) - {}'.format(tr_('읽는 중...'),
                                               info.get('nickname'),
                                               info.get('uid'),
                                               len(info['items']))
        if cw:
            if not cw.alive:
                raise Exception('cw dead')
            cw.setTitle(msg)
        else:
            print(msg)
        # Give up after several consecutive empty snapshots.
        return sd['count_empty'] > 4
Beispiel #3
0
def get_imgs_single(url,
                    session,
                    types,
                    format='[%y-%m-%d] id_ppage',
                    cw=None):
    """Collect media for a single tweet URL.

    First attempts the tweet's own video; when that fails, falls back to
    scanning the tweet's media entities via get_imgs_from_tweet.
    """
    print_ = get_print(cw)
    tweet_id = re.find('/status/([0-9]+)', url)
    if tweet_id is None:
        raise Exception('no id')

    data = TwitterAPI(session, cw).tweet(tweet_id, url)
    tweets = data["globalObjects"]["tweets"]
    # Follow a retweet through to the original tweet when present.
    tweet_id = tweets[tweet_id].get('retweeted_status_id_str') or tweet_id
    tweet = tweets[tweet_id]

    time = get_time(tweet)

    img = Image(url, url, tweet_id, time, 0, format, cw, True,
                try_n=1, n_thread=4)
    try:
        img.url()
    except Exception as e:
        print(print_error(e)[-1])
        return get_imgs_from_tweet(tweet, session, types, format, cw)
    return [img]
Beispiel #4
0
    def post_processing(self):
        """Convert downloaded ugoira zip archives to the selected animation
        format with ffmpeg, updating the widget's progress/icons as it goes."""
        cw = self.customWidget
        ui_setting = self.ui_setting
        format = self._format
        if cw is not None and format is not None:
            # Read conversion options from the UI; fall back to defaults
            # when the settings widgets are unavailable.
            try:
                dither = ui_setting.checkDither.isChecked()
                quality = ui_setting.ugoira_quality.value()
            except Exception as e:
                print(e)
                dither = True
                quality = 90

            # Select the ugoira zip archives that were actually downloaded.
            imgs_ugoira = []
            for img in self.imgs:
                if img.url not in cw.urls:
                    continue
                if img.type == 'ugoira':
                    if os.path.splitext(img.url)[1].lower() == '.zip':
                        imgs_ugoira.append(img)

            for j, img in enumerate(imgs_ugoira):
                if not cw.valid or not cw.alive:
                    return
                # Show per-file conversion progress in the progress bar.
                self.exec_queue.put(
                    (cw,
                     (u'customWidget.pbar.setFormat(u"[%v/%m]  {} [{}/{}]")'
                      ).format(tr_(u'움짤 변환...'), j, len(imgs_ugoira))))
                filename = os.path.join(self.dir, img.filename)
                out = os.path.splitext(filename)[0] + '.' + format
                cw.print_((u'convert ugoira: {} --> {}').format(filename, out))
                try:
                    # Per-frame delays drive the output animation timing.
                    duration = [
                        frame.delay for frame in img.ugoira_data.frames
                    ]
                    self.print_((u'Duration: {}').format(duration))
                    ffmpeg.gif(filename,
                               out,
                               duration=duration,
                               dither=dither,
                               quality=quality,
                               cw=cw)
                except Exception as e:
                    # Conversion failed for this file: log and move on.
                    self.print_(print_error(e)[0])
                    continue
                if not cw.valid or not cw.alive:
                    return
                try:
                    # Swap the zip for the converted file in the widget's
                    # bookkeeping; schedule the original for removal.
                    self.removeDirList.append((filename, False))
                    cw.dones.add(out)
                    i = cw.urls.index(img.url)
                    cw.imgs[i] = out
                    if i == 0:
                        cw.firstImg = out
                        cw.setIcon(out)
                except Exception as e:
                    return self.Invalid(e=e)

            # Restore the default progress bar label.
            self.exec_queue.put(
                (cw, u'customWidget.pbar.setFormat("[%v/%m]")'))
Beispiel #5
0
def process_ids(ids, info, imgs, cw, depth=0):
    """Fetch image info for each Pixiv illust id in *ids*, appending the
    results to *imgs* in place.

    Stops early once the configured maximum range is reached. At depth 0,
    session/login errors are re-raised so the caller can handle a logout
    that happened mid-extraction; any other per-id error is logged and
    that id is skipped.
    """
    print_ = get_print(cw)
    max_pid = get_max_range(cw)
    for id_illust in ids:  # enumerate index was unused; iterate directly
        try:
            info_illust = get_info(
                'https://www.pixiv.net/en/artworks/{}'.format(id_illust),
                cw,
                depth=depth + 1)
        except Exception as e:
            if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                               or type(e) == errors.LoginRequired
                               ):  # logout during extraction
                raise  # bare raise preserves the original traceback
            print_('process_ids error ({}):\n{}'.format(
                depth,
                print_error(e)[0]))
            continue
        imgs += info_illust['imgs']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            # Top-level loop: abort promptly if the widget was closed.
            check_alive(cw)
 def get(self, url):
     """Resolve a Sankaku post page to its full-size image URL.

     Retries up to 4 times with rate-limit backoff. Sets ``self.filename``
     from the post id and extension. Returns '' for Sankaku Plus
     (paywalled) posts; raises when the image cannot be found at all.
     """
     cw = self.cw
     d = self.d
     print_ = get_print(cw)
     
     for try_ in range(4):
         wait(cw)
         html = ''
         try:
             html = downloader.read_html(url, referer=self.referer, session=self.session)
             #url = 'https:' + re.findall('[Oo]riginal:? ?<a href="(//[0-9a-zA-Z_-]{2,2}.sankakucomplex.com/data/.{0,320}?)"', html)[0]
             soup = Soup(html)
             # Prefer the "highres" download link; fall back to the inline image.
             highres = soup.find(id='highres')
             url = urljoin(url, highres['href'] if highres else soup.find(id='image')['src'])
             break
         except Exception as e:
             e_msg = print_error(e)[0]
             if '429 Too many requests'.lower() in html.lower():
                 # Rate limited: back off 120s, then 240s (capped).
                 t_sleep = 120 * min(try_ + 1, 2)
                 e = '429 Too many requests... wait {} secs'.format(t_sleep)
             elif 'post-content-notification' in html: # sankaku plus
                 print_('Sankaku plus: {}'.format(self.id))
                 return ''
             else:
                 t_sleep = 5
             s = '[Sankaku] failed to read image (id:{}): {}'.format(self.id, e)
             print_(s)
             sleep(t_sleep, cw)
     else:
         raise Exception('can not find image (id:{})\n{}'.format(self.id, e_msg))
     # Round-trip through Soup to unescape HTML entities in the URL.
     soup = Soup('<p>{}</p>'.format(url))
     url = soup.string
     # Extension without any query string.
     ext = os.path.splitext(url)[1].split('?')[0]
     self.filename = '{}{}'.format(self.id, ext)
     return url
Beispiel #7
0
    def _pagination(self,
                    url_api,
                    params=None,
                    entry_tweet="tweet-",
                    entry_cursor="cursor-bottom-"):
        """Yield tweets from a cursor-paginated Twitter API endpoint.

        Follows the bottom cursor until the API stops advancing it.
        ``entry_tweet``/``entry_cursor`` are the entryId prefixes that
        identify tweet and cursor entries in the timeline instructions.
        """
        if params is None:
            params = self.params.copy()

        while True:
            cursor = None
            if params.get("cursor"):
                self.print_('cursor: {}'.format(params.get("cursor")))

            # 2303
            n_try = RETRY_PAGINATION
            for try_ in range(n_try):
                try:
                    data = self._call(url_api, params=params)
                    if 'globalObjects' not in data:
                        # NOTE(review): setting try_ = n_try only suppresses
                        # the retry log/sleep below; the for loop itself
                        # still retries — confirm this is intended.
                        try_ = n_try
                        raise Exception(str(data['errors']))
                    tweets = data["globalObjects"]["tweets"]
                    break
                except Exception as e:
                    e_ = e
                    e_msg = print_error(e)[0]
                    if try_ < n_try - 1:
                        self.print_('retry... _pagination ({})\n{}'.format(
                            try_ + 1, e_msg))
                        sleep(30, self.cw)
            else:
                # All retries failed: stop paginating instead of raising.
                break  #raise e_ #3392

            users = data["globalObjects"]["users"]
            for instr in data["timeline"]["instructions"]:
                for entry in instr.get("addEntries", {}).get("entries", []):
                    if entry["entryId"].startswith(entry_tweet):
                        tid = entry["content"]["item"]["content"]["tweet"][
                            "id"]
                        if tid not in tweets:
                            self.print_(
                                "Skipping unavailable Tweet {}".format(tid))
                            continue
                        #print('tid:', tid)#
                        # Inline the author object so consumers receive a
                        # self-contained tweet dict.
                        tweet = tweets[tid]
                        tweet["user"] = users[tweet["user_id_str"]]

                        yield tweet

                    elif entry["entryId"].startswith(entry_cursor):
                        cursor = entry["content"]["operation"]["cursor"][
                            "value"]

                # Stop when the cursor is missing or did not advance.
                if not cursor or params.get('cursor') == cursor:
                    print('same cursor')
                    return
                params["cursor"] = cursor
            if params.get("cursor") is None:  # nothing
                self.print_('no cursor')
                break
    def __init__(self, tweet, format, types, session, cw):
        """Parse a scraped tweet HTML node into downloadable images.

        tweet: a parsed node exposing ``.attrs`` / ``.findAll`` (Soup).
        types: media kinds to collect ('img', 'video').
        """
        print_ = get_print(cw)
        self.tweet = tweet
        self.session = session
        self.username = tweet.attrs['data-screen-name']
        self.id = int(tweet.attrs['data-tweet-id'])
        # Timestamp: prefer the data-time attribute on a <span>; otherwise
        # derive it from the snowflake id (upper bits are milliseconds
        # since the Twitter epoch 1288834974657 = 2010-11-04).
        for span in tweet.findAll('span'):
            time = span.attrs.get('data-time')
            if time:
                break
        else:
            # Bug fix: original read the builtin `id` function instead of
            # the tweet id, which raised TypeError on `id >> 22`.
            time_ms = (self.id >> 22) + 1288834974657
            time = time_ms / 1000
        self.time = time
        self.url = urljoin('https://twitter.com',
                           tweet.attrs['data-permalink-path'])
        self.withheld = 'withheld-tweet' in tweet.attrs
        if self.withheld:
            print_(('    withheld: {}').format(self.id))
        urls = []
        if 'img' in types:  # merged the two identical type checks
            # Directly embedded images; request the original resolution.
            for div in tweet.findAll('div'):
                url = div.attrs.get('data-image-url')
                if not url:
                    continue
                if ':' not in os.path.basename(url):
                    url += ':orig'
                urls.append(url)

            # Linked twitpic images.
            for a in tweet.findAll('a'):
                url = a.attrs.get('data-expanded-url', '')
                if '//twitpic.com/' not in url:
                    continue
                print_(('twitpic: {}, {}').format(self.id, url))
                try:
                    url = get_twitpic(url, session)
                    if url in urls:
                        print('duplicate twitpic')
                        continue
                    urls.append(url)
                except Exception as e:
                    print_(('Failed to read twitpic:\n{}').format(
                        print_error(e)[(-1)]))

        if 'grid-tweet' in tweet.attrs['class']:
            url = tweet.attrs['data-url'] + ':large'
            urls.append(url)
        self.imgs = []
        for page, url in enumerate(urls):
            img = Image(url, self.url, self.id, self.time, page, format, cw)
            self.imgs.append(img)

        if 'PlayableMedia-container' in str(tweet):
            self.isVideo = True
            if 'video' in types:
                img = Image(self.url, self.url, self.id, self.time, 0, format,
                            cw, True)
                self.imgs.append(img)
def get_imgs_page(id_art, session, date=None, cw=None):
    """Collect images (and embedded videos) for one ArtStation project.

    Returns a list of Image objects, or [] when the project JSON cannot
    be fetched or parsed.
    """
    print_ = get_print(cw)
    url_json = 'https://www.artstation.com/projects/{}.json'.format(id_art)
    post_url = 'https://www.artstation.com/artwork/{}'.format(id_art)
    try:
        html = downloader.read_html(url_json,
                                    session=session,
                                    referer=post_url)
        data = json.loads(html)
        imgs_ = data['assets']
    except Exception as e:
        print_(print_error(e)[(-1)])
        return []

    # Default the date (YY-MM-DD) from the project's creation timestamp.
    if date is None:
        date = data['created_at'][2:10]

    imgs = []
    for page, img in enumerate(imgs_):
        if not img['has_image']:
            print('no img')
            continue
        url = None
        video = None
        embed = img.get('player_embedded')
        if embed:
            # Asset is an embedded player; try to pull the raw video source
            # out of the embed page.
            soup = Soup(embed)
            url_embed = soup.find('iframe').attrs['src']
            print_('embed: {}'.format(url_embed))
            try:
                html = downloader.read_html(url_embed,
                                            session=session,
                                            referer=post_url)
                soup = Soup(html)
                url = soup.find('video').find('source').attrs['src']
            except Exception as e:
                pass  # best-effort; fall through to the canonical-link check
            if not url:
                # No direct <video> source: a canonical link means a
                # YouTube embed, which is deliberately not downloaded
                # (the raise jumps to the except below and clears url).
                try:
                    url = soup.find('link', {'rel': 'canonical'}).attrs['href']
                    print_('YouTube: {}'.format(url))
                    raise Exception('YouTube')
##                    from extractor import youtube_downloader
##                    video = youtube_downloader.Video(url, cw=cw)
                except Exception as e:
                    print(e)
                    url = None
        if not url:
            # Plain image asset.
            url = img['image_url']
        if video:
            img = video
        else:
            img = Image(post_url, date, url, page)

        img.data = data  #
        imgs.append(img)

    return imgs
 def get(self, _):
     """Return the image URL and set ``self.filename`` from id/page/ext.

     GIF-looking URLs have their real extension probed over the network
     (#3235); on probe failure the original extension is kept.
     """
     print_ = get_print(self.cw)
     url, ext = self._url, get_ext(self._url)
     if ext.lower() == '.gif':
         print_('get_ext: {}, {}'.format(self.id_, url))
         try:
             ext = downloader.get_ext(url)
         except Exception as err:
             print_('Err: {}, {}\n'.format(self.id_, url) + print_error(err)[0])
     self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
     return url
def get_imgs_from_tweet(tweet, session, types, format, cw=None):
    """Build Image objects for every media attachment of a tweet dict.

    Twitpic links among the tweet's URL entities are resolved and
    injected into the media list as photos. Media whose kind is not in
    *types* is skipped.
    """
    print_ = get_print(cw)
    tid = tweet['id_str']

    tweet.setdefault('extended_entities', {'media': []})

    # Resolve twitpic links into photo media entries.
    for url_ in tweet['entities'].get('urls', []):
        url_ = url_['expanded_url']
        if '//twitpic.com/' not in url_:
            continue
        print_('twitpic: {}'.format(url_))
        try:
            resolved = get_twitpic(url_, session)
            tweet['extended_entities']['media'].append({
                'type': 'photo',
                'media_url': resolved,
                'expanded_url': 'https://twitter.com',
            })
        except Exception as e:
            print_('Invalid twitpic')
            print_(print_error(e)[-1])

    time = get_time(tweet)

    imgs = []
    kind_map = {'photo': 'img', 'animated_gif': 'video'}
    for m in tweet['extended_entities']['media']:
        kind = kind_map.get(m['type'], m['type'])
        if kind not in types:
            continue
        if kind == 'video':
            # Highest-bitrate variant wins.
            best = sorted(m['video_info']['variants'],
                          key=lambda v: v.get('bitrate', 0))[-1]
            url_media = best['url']
        elif kind == 'img':
            url_media = m['media_url']
            if ':' not in os.path.basename(url_media):
                url_media += ':orig'
        else:
            raise NotImplementedError('unknown type')
        imgs.append(Image(url_media, m['expanded_url'], tid, time,
                          len(imgs), format, cw, kind == 'video'))

    return imgs
 def get(self, _):
     """Return the image URL and set ``self.filename``.

     Extensions other than jpg/png/mp4 are re-probed from the server
     (#4645, #3235), using the caller-supplied referer; probe failures
     are logged and the original extension kept.
     """
     print_ = get_print(self.cw)
     url = self._url
     ext = get_ext(url)
     known = ('jpg', 'png', 'mp4')
     if ext.lower()[1:] not in known:
         print_('get_ext: {}, {}'.format(self.id_, url))
         try:
             ext = downloader.get_ext(url, referer=_)
         except Exception as err:
             print_('Err: {}, {}\n'.format(self.id_, url) + print_error(err)[0])
     self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
     return url
    def pp(self, filename):
        """Post-process a downloaded file.

        Videos: merge the separate audio track (UHD downloads) or convert
        non-mp4 containers to mp4. Audio: convert non-mp3 to mp3 and
        optionally embed album art. Returns the new filename, or None when
        the file was left untouched.
        """
        cw = self.cw
        print_ = get_print(cw)
        ui_setting = utils.ui_setting
        ext = os.path.splitext(filename)[1].lower()
        if not os.path.isfile(filename):
            print('no file: {}'.format(filename))
            return

        filename_new = None
        if self.type == 'video' and (self.audio is not None or ext != '.mp4') and not self.stream.live: # UHD or non-mp4
            if self.audio is not None: # merge
                print_('Download audio: {}'.format(self.audio))
                tmp_id = uuid()  # renamed from `hash`: avoid shadowing the builtin
                path = os.path.join(os.path.dirname(filename), '{}_a.tmp'.format(tmp_id))
                if cw is not None:
                    cw.trash_can.append(path)  # ensure the temp audio is cleaned up
                if constants.FAST:
                    downloader_v3.download(self.audio, chunk=1024*1024, n_threads=2, outdir=os.path.dirname(path), fileName=os.path.basename(path), customWidget=cw, overwrite=True)
                else:
                    downloader.download(self.audio, outdir=os.path.dirname(path), fileName=os.path.basename(path), customWidget=cw, overwrite=True)
                ext, out = ffmpeg.merge(filename, path, cw=cw, vcodec=self.vcodec)
                #print(out)
                # ffmpeg.merge may change the container; rename to match.
                name, ext_old = os.path.splitext(filename)
                if ext_old.lower() != ext.lower():
                    print_('rename ext {} --> {}'.format(ext_old, ext))
                    filename_new = '{}{}'.format(name, ext)
                    if os.path.isfile(filename_new):
                        os.remove(filename_new)
                    os.rename(filename, filename_new)
            else: # convert non-mp4 video -> mp4
                name = os.path.splitext(filename)[0]
                filename_new = '{}.mp4'.format(name)
                print_('Convert video: {} -> {}'.format(filename, filename_new))
                ffmpeg.convert(filename, filename_new, cw=cw)
        elif self.type == 'audio' and ext != '.mp3': # convert non-mp3 audio -> mp3
            name = os.path.splitext(filename)[0]
            filename_new = '{}.mp3'.format(name)
            ffmpeg.convert(filename, filename_new, '-shortest -preset ultrafast -b:a {}k'.format(get_abr()), cw=cw)

        if self.type == 'audio' and ui_setting.albumArt.isChecked():
            try:
                self.thumb.seek(0)#
                ffmpeg.add_cover(filename_new, self.thumb, {'artist':self.username, 'title':self.title}, cw=cw)
            except Exception as e:
                # Album art is best-effort; log and keep the converted file.
                s = print_error(e)[-1]
                print_(s)

        utils.pp_subtitle(self, filename, cw)

        return filename_new
Beispiel #14
0
    def read(self):
        """Resolve the YouTube URL into video/audio streams and queue them.

        Raises when no stream is usable; playlists are delegated to
        process_playlist.
        """
        ui_setting = self.ui_setting
        cw = self.cw
        print_ = get_print(cw)
        if self.yt_type == 'video':
            res = get_resolution()
            info = get_videos(self.url,
                              type=self.yt_type,
                              max_res=res,
                              only_mp4=False,
                              audio_included=False,  # was the obscure `not True`
                              cw=cw)
        else:
            abr = get_abr()
            info = get_videos(self.url, type=self.yt_type, max_abr=abr, cw=cw)
        videos = info['videos']

        if not videos:
            raise Exception('No videos')

        self.enableSegment(overwrite=True)

        # first video must be valid: drop broken entries until one resolves,
        # or re-raise the last failure when none do.
        while videos:
            video = videos[0]
            try:
                video.url()
                break
            except Exception as e:
                e_ = e
                self.print_(print_error(e)[0])
                videos.remove(video)
        else:
            raise e_

        if info['type'] != 'single':
            video = self.process_playlist(info['title'], videos)
        else:
            self.urls.append(video.url)
            self.title = video.title
            if video.stream.live:
                # Live stream: leave the task unlocked.
                self.lock = False

        self.artist = video.username
        self.setIcon(video.thumb)
Beispiel #15
0
def foo(url, soup, info, reblog=False):
    """Scan one page of posts, appending new Post entries to info['posts'].

    Seen ids are tracked in info['ids'] and the last-seen id in
    info['last']; reblogged posts are skipped unless *reblog* is set.
    info['c'] advances the paging offset (5 for the first page, then 20).
    """
    #print('foo', info['c'], len(info['ids']))
    for post in soup.findAll('div', class_='wrap-post'):
        # The numeric post id is embedded in the element's second CSS class.
        try:
            post_id = int(re.find('[0-9]+', post.attrs['class'][1]))
        except Exception as e:
            print(print_error(e)[-1])
            continue
        if post_id in info['ids']:
            continue
        info['ids'].add(post_id)
        info['last'] = post_id
        # 'ogname' presumably marks a reblog's original author — skip
        # unless reblogs were requested.
        if not reblog and post.find('div', class_='ogname'):
            continue
        for p, mag in enumerate(post.findAll(['a', 'div'], class_='magnify')):
            # Renamed from `post`: the original clobbered its own loop variable.
            entry = Post(mag.attrs['href'], url, post_id, p)
            info['posts'].append(entry)
    info['c'] += 20 if info['c'] else 5
Beispiel #16
0
    def get(self, _=None):
        """Resolve and cache the best direct video URL for this tweet.

        Enumerates ytdl formats, prefers higher resolution (with a slight
        penalty for non-http protocols), validates candidates with
        downloader.ok_url before accepting (#4185), and wraps .m3u8
        results in an ffmpeg stream. Retries up to ``self.try_n`` times.
        """
        if self._url_cache:
            return self._url_cache
        print_ = get_print(self.cw)
        for try_ in range(self.try_n):
            try:
                d = ytdl.YoutubeDL(cw=self.cw)
                info = d.extract_info(self.referer)

                fs = info['formats']
                for f in fs:
                    print_('{} {} - {}'.format(f.get('height'), f['protocol'],
                                               f['url']))

                # Sort key: height, nudging non-http protocols just below
                # http formats of the same height.
                def key(f):
                    h = f.get('height', 0)
                    if not f['protocol'].startswith('http'):
                        h -= .1
                    return h

                # Take the best format whose URL actually responds.
                for f in sorted(fs, key=key, reverse=True):
                    if downloader.ok_url(f['url'], self.referer):  #4185
                        break
                    else:
                        print_('invalid video: {}'.format(f['url']))
                else:
                    raise Exception('no valid videos')
                url = f['url']
                ext = get_ext(url)
                self.ext = ext
                print_('get_video: {} {}'.format(url, ext))
                if ext.lower() == '.m3u8':
                    # Non-live HLS: hand off to an ffmpeg stream wrapper.
                    url = ffmpeg.Stream(url)
                    url._live = False
                self._url_cache = url
                return url
            except Exception as e:
                e_ = e
                msg = print_error(e)[0]
                print_('\nTwitter video Error:\n{}'.format(msg))
                if try_ < self.try_n - 1:
                    sleep(10, self.cw)
        else:
            # All retries exhausted: surface the last error.
            raise e_
 def run(self):
     """Worker loop: pull (id, result-list, index) jobs off the queue and
     fill each result slot with that illust's image list.

     ``cw``, ``depth`` and ``print_`` are free variables from the
     enclosing scope. On a depth-0 login/session error the exception
     object itself is stored in the result slot so the consumer can
     detect and re-raise it.
     """
     while self.alive:
         try:
             id_, res, i = self.queue.popleft()
         except Exception as e:
             # Queue empty: idle briefly and poll again.
             sleep(.1)
             continue
         try:
             info_illust = get_info(
                 'https://www.pixiv.net/en/artworks/{}'.format(id_),
                 cw,
                 depth=depth + 1)
             res[i] = info_illust['imgs']
         except Exception as e:
             if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                                or type(e) == errors.LoginRequired
                                ):  # logout during extraction
                 res[i] = e
             print_('process_ids error ({}):\n{}'.format(
                 depth,
                 print_error(e)[0]))
         finally:
             # Always decrement the outstanding-work counter.
             Thread.add_rem(-1)
Beispiel #18
0
def get_imgs(url,
             title=None,
             customWidget=None,
             d=None,
             types=('img', 'gif', 'video'),
             session=None):
    """Crawl a Sankaku Complex listing and return a list of Image objects.

    types: media kinds to keep; the default is now an immutable tuple
    (was a mutable default list). Posts already present in the download
    directory (matched by id) are returned as local images. Raises when
    no images are found.
    """
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    # Map post-id -> local path for files already in the download folder.
    local_ids = {}
    if customWidget is not None:
        dir_ = customWidget.downloader.dir
        try:
            names = os.listdir(dir_)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id_ = os.path.splitext(name)[0]
            local_ids[id_] = os.path.join(dir_, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        subdomain = 'chan'
    elif 'idol.sankakucomplex' in url:
        subdomain = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(subdomain)
    if customWidget is not None:
        customWidget.exec_queue.put(
            (customWidget, u"customWidget.setTitle(u'{}  {}')".format(
                tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1)  # brief delay between page requests
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        # Rate limited: wait two minutes, staying responsive to cancel.
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for _ in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})

        if not articles:
            break

        for article in articles:
            # 1183: classify by the preview thumbnail's tag list.
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags:  # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue

            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin(
                    'https://{}.sankakucomplex.com'.format(subdomain),
                    url_img)
            id_ = re.find('show/([0-9]+)', url_img)
            print_(article)
            if id_ is None:  # sankaku plus
                continue
            local = id_ in local_ids
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id_]
                img = Image(subdomain,
                            id_,
                            url_img,
                            url,
                            local=local,
                            cw=customWidget,
                            d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        if customWidget and not customWidget.alive:
            break

        try:
            # For page > 50: follow the server-provided next-page link.
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(subdomain),
                          pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break

        if customWidget is not None:
            customWidget.setTitle(u'{}  {} - {}'.format(
                tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')

    if not imgs:
        raise Exception('no images')

    return imgs
    def get(self, url, force=False):
        if self._url:
            return self._url

        type = self.type
        only_mp4 = self.only_mp4
        audio_included = self.audio_included
        max_res = self.max_res
        max_abr = self.max_abr
        cw = self.cw
        print_ = get_print(cw)

        if force:
            max_abr = 0

        print('max_res: {}'.format(max_res))
        for try_ in range(8):
            try:
                yt = ytdl.YouTube(url)
                break
            except Exception as e:
                e_ = e
                s = print_error(e)[-1]
                print_('### youtube retry...\n{}'.format(s))
                sleep(try_ / 2, cw)
        else:
            raise e_

        streams = yt.streams.all()
        print_streams(streams, cw)

        #3528
        time = datetime.strptime(yt.info['upload_date'], '%Y%m%d')
        self.utime = (time - datetime(1970, 1, 1)).total_seconds()
        print_('utime: {}'.format(self.utime))

        if type == 'video':
            streams[:] = [
                stream for stream in streams if stream.video_codec is not None
            ]
            # Only mp4
            if only_mp4:
                streams_ = list(streams)
                streams[:] = []
                for stream in streams_:
                    if stream.subtype == 'mp4':
                        streams.append(stream)

            # Audio included; Non-HD
            if audio_included:
                streams_ = list(streams)
                streams[:] = []
                for stream in streams_:
                    if stream.audio_codec is not None:
                        streams.append(stream)

            # Maximum resolution
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.resolution is None:
                    continue
                res = int(stream.resolution.replace('p', ''))
                if max_res is None or res <= max_res:
                    streams.append(stream)
            print_('')
        elif type == 'audio':
            streams[:] = [stream for stream in streams if stream.abr]
            # Maximum abr
            abrs = [stream.abr for stream in streams]
            max_abr = min(max(abrs), max_abr)
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.abr is None:
                    continue
                abr = stream.abr
                if max_abr is None or abr >= max_abr:
                    streams.append(stream)
            #'''
        else:
            raise Exception(u'type "{}" is not supported'.format(type))

        # Pick the best
        while streams:
            if type == 'video':
                ress = [
                    int_(stream.resolution.replace('p', ''))
                    for stream in streams
                ]
                m = max(ress)
                prefer_format = 'mp4'
            elif type == 'audio':
                ress = [stream.abr for stream in streams]
                m = min(ress)
                prefer_format = 'webm'
            print('Resolutions:', ress)
            stream_final = None
            for stream, res in zip(streams, ress):
                if res == m:
                    if type == 'video':
                        foo = (stream_final is not None) and (
                            stream_final.audio_codec is None) and bool(
                                stream.audio_codec)
                    elif type == 'audio':
                        foo = False
                    if stream_final is None or (
                            stream_final.fps <= stream.fps and
                        (foo or (stream_final.subtype.lower() != prefer_format
                                 and stream.subtype.lower() == prefer_format)
                         or stream_final.fps < stream.fps)):
                        #print(foo)
                        print_(u'# stream_final {} {} {} {} {} {}fps'.format(
                            stream, stream.format, stream.resolution,
                            stream.subtype, stream.audio_codec, stream.fps))
                        stream_final = stream

            ok = downloader.ok_url(stream_final.url,
                                   referer=url) if isinstance(
                                       stream_final.url, str) else True
            if ok:
                break
            else:
                print_('stream is not valid')
                streams.remove(stream_final)
        else:
            if type == 'audio' and not force:
                return self.get(url, force=True)  # 1776
            raise Exception('No videos')

        stream = stream_final

        ##        if stream.video_codec and stream_final.video_codec.lower().startswith('av'):
        ##            self.vcodec = 'h264'

        self.yt = yt
        self.id = yt.video_id
        self.stream = stream
        self.username = yt.info['uploader']
        self.stream_audio = None
        self.audio = None
        self.thumb = None
        self.thumb_url = None
        self.subtitles = yt.subtitles

        if type == 'audio' and 'DASH' in self.stream.format:
            self.stream.setDashType('audio')

        # Audio
        if type == 'video' and stream.audio_codec is None:
            print('audio required')
            streams = [stream for stream in yt.streams.all() if stream.abr]
            print_streams(streams, cw)

            # only mp4; https://github.com/KurtBestor/Hitomi-Downloader-issues/issues/480
            def isGood(stream):
                return stream.audio_codec.lower().startswith('mp4')

            streams_good = [stream for stream in streams if isGood(stream)]
            if streams_good:
                streams = streams_good
                print_streams(streams, cw)
            # only audio?
            if any(stream.resolution is None for stream in streams):
                streams = [
                    stream for stream in streams if stream.resolution is None
                ]
                print_streams(streams, cw)
            best_audio = None
            best_abr = 0
            for stream in streams:
                abr = stream.abr
                if abr > best_abr:
                    best_abr = abr
                    best_audio = stream
            if best_audio is None:
                raise Exception('No audio')
            print(best_audio)
            self.stream_audio = best_audio
            if 'DASH' in self.stream_audio.format:
                self.stream_audio.setDashType('audio')
            self.audio = best_audio.url
            if callable(self.audio):
                self.audio = self.audio()

        # Thumbnail
        for quality in ['sddefault', 'hqdefault', 'mqdefault', 'default']:
            print('####', yt.thumbnail_url)
            self.thumb_url = yt.thumbnail_url.replace('default', quality)
            f = BytesIO()
            try:
                downloader.download(self.thumb_url, buffer=f)
                data = f.read()
                if len(data) == 0:
                    raise AssertionError('Zero thumbnail')
                if data == empty_thumbnail:
                    raise AssertionError('Empty thumbnail')
                f.seek(0)
                break
            except Exception as e:
                print(print_error(e)[-1])
        self.thumb = f

        #
        _url = self.stream.url
        if callable(_url):
            _url = _url()
        self._url = _url
        title = yt.title
        #soup = Soup(yt.watch_html)
        #title =  soup.title.text.replace('- YouTube', '').strip()
        self.title = title
        ext = u'.' + self.stream.subtype
        self.filename = format_filename(title, self.id, ext)

        print_(u'Resolution: {}'.format(stream.resolution))
        print_(u'Codec: {} / {}'.format(stream.video_codec,
                                        stream.audio_codec))
        print_(u'Abr: {}'.format(stream.abr))
        print_(u'Subtype: {}'.format(stream.subtype))
        print_(u'FPS: {}\n'.format(stream.fps))

        return self._url
    def _pp(self, filename):
        """Post-process a downloaded YouTube file.

        For videos: merges the separately downloaded audio stream (UHD
        streams carry no audio) or converts a non-mp4 container to mp4.
        For audio: converts non-mp3 files to mp3, then optionally embeds
        album art and downloads/converts subtitles per the UI settings.

        Returns the new filename when the file was renamed or converted,
        otherwise None.
        """
        cw = self.cw
        print_ = get_print(cw)
        ui_setting = utils.ui_setting
        ext = os.path.splitext(filename)[1].lower()
        if not os.path.isfile(filename):
            print(u'no file: {}'.format(filename))
            return

        filename_new = None
        if self.type == 'video' and (self.audio is not None
                                     or ext != '.mp4'):  # UHD or non-mp4
            if self.audio is not None:  # merge video + audio
                print_(u'Download audio: {}'.format(self.audio))
                tmp_id = uuid()  # renamed from `hash`: avoid shadowing the builtin
                path = os.path.join(os.path.dirname(filename),
                                    '{}_a.tmp'.format(tmp_id))
                if cw is not None:
                    cw.trash_can.append(path)  # temp audio is cleaned up later
                if constants.FAST:
                    downloader_v3.download(self.audio,
                                           chunk=1024 * 1024,
                                           n_threads=2,
                                           outdir=os.path.dirname(path),
                                           fileName=os.path.basename(path),
                                           customWidget=cw,
                                           overwrite=True)
                else:
                    downloader.download(self.audio,
                                        outdir=os.path.dirname(path),
                                        fileName=os.path.basename(path),
                                        customWidget=cw,
                                        overwrite=True)
                ext, _ = ffmpeg.merge(filename,
                                      path,
                                      cw=cw,
                                      vcodec=self.vcodec)
                name, ext_old = os.path.splitext(filename)
                if ext_old.lower() != ext.lower():
                    # ffmpeg.merge may change the container; rename to match
                    print_(u'rename ext {} --> {}'.format(ext_old, ext))
                    filename_new = u'{}{}'.format(name, ext)
                    if os.path.isfile(filename_new):
                        os.remove(filename_new)
                    os.rename(filename, filename_new)
            else:  # convert non-mp4 video -> mp4
                name = os.path.splitext(filename)[0]
                filename_new = u'{}.mp4'.format(name)
                print_(u'Convert video: {} -> {}'.format(
                    filename, filename_new))
                ffmpeg.convert(filename, filename_new, cw=cw)
        elif self.type == 'audio' and ext != '.mp3':  # convert non-mp3 audio -> mp3
            name = os.path.splitext(filename)[0]
            filename_new = u'{}.mp3'.format(name)
            ffmpeg.convert(filename,
                           filename_new,
                           '-shortest -preset ultrafast -b:a {}k'.format(
                               get_abr()),
                           cw=cw)

        if self.type == 'audio' and ui_setting.albumArt.isChecked():
            try:
                self.thumb.seek(0)  # buffer may have been read already
                # fix: when no conversion happened (file was already .mp3)
                # filename_new is None -- fall back to the original file
                ffmpeg.add_cover(filename_new or filename,
                                 self.thumb, {
                                     'artist': self.username,
                                     'title': self.title
                                 },
                                 cw=cw)
            except Exception as e:
                s = print_error(e)[-1]
                print_(s)

        if ui_setting and ui_setting.subtitle.isChecked():
            lang = {
                'korean': 'ko',
                'english': 'en',
                'japanese': 'ja'
            }[compatstr(ui_setting.subtitleCombo.currentText()).lower()]
            if lang in self.subtitles:
                try:
                    subtitle = self.subtitles[lang]
                    filename_sub = u'{}.vtt'.format(
                        os.path.splitext(filename)[0])
                    downloader.download(
                        subtitle,
                        os.path.dirname(filename_sub),
                        fileName=os.path.basename(filename_sub),
                        overwrite=True)
                    filename_sub_new = u'{}.srt'.format(
                        os.path.splitext(filename_sub)[0])
                    cw.imgs.append(filename_sub_new)
                    cw.dones.add(
                        os.path.realpath(filename_sub_new).replace(
                            '\\\\?\\', ''))
                    srt_converter.convert(filename_sub, filename_sub_new)
                    cw.setSubtitle(True)
                finally:
                    try:
                        # the .vtt is only an intermediate; best-effort removal
                        os.remove(filename_sub)
                    except Exception:
                        pass

        return filename_new
# Beispiel #21 (score 0) — extraction artifact from the source listing, not code
    def init(self):
        """Classify self.url into a pixiv job type and normalize it.

        Sets self.pixiv_type to one of following / bookmark / illust /
        search / user, rewrites shorthand inputs into full pixiv URLs,
        and verifies the API login. Returns 'stop' on failure.
        """
        self.url = clean_url(self.url)
        url = self.url

        # Determine the type
        if 'bookmark.php?type=user' in url or url.startswith(headers['following']):
            type = 'following'
        elif 'bookmark.php' in url or url.startswith(headers['bookmark']) or '/bookmarks/' in url:
            type = 'bookmark'
        elif 'illust_id=' in url or url.startswith(headers['illust']) or '/artworks/' in url:
            type = 'illust'
        elif 'search.php' in url or url.startswith(headers['search']) or '/tags/' in url:
            type = 'search'
            qs = query_url(url)  # parse the query string once
            # order: date_d, date, popular_d, popular_male_d, popular_female_d
            order = qs.get('order', ['date_d'])[0]
            scd = qs.get('scd', [None])[0]  # start date, e.g. 2019-09-27
            ecd = qs.get('ecd', [None])[0]  # end date, e.g. 2019-09-28
            blt = qs.get('blt', [None])[0]  # bookmark count lower bound, e.g. 5000
            bgt = qs.get('bgt', [None])[0]  # bookmark count upper bound, e.g. 9999
            type_ = qs.get('type', [None])[0]  # None (all), illust, manga, ugoira
            if type_ is None and '/tags/' in url:
                # /tags/<word>/<type> URLs carry the type in the path
                try:
                    type_ = url.split('/tags/')[1].split('/')[1]
                except IndexError:
                    type_ = None
                type_ = {'illustrations': 'illust'}.get(type_, type_)
            self.info = {'order': order,
               'scd': scd,
               'ecd': ecd,
               'blt': blt,
               'bgt': bgt,
               'type': type_}
        elif 'id=' in url and 'mode=' not in url or url.startswith(headers['user']) or 'pixiv.me' in url or '/users/' in url:
            type = 'user'
        else:
            self.Invalid((u'[pixiv] Can not determine type: {}').format(url))
            return 'stop'
        header = headers[type]
        if 'pixiv.net' in url or 'pixiv.me' in url:
            if not url.startswith('http://') and not url.startswith('https://'):
                url = u'https://' + url
            self.url = url
        else:
            # bare id / keyword with an optional shorthand prefix -> build a full URL
            url = url.replace('bmk_', '').replace('illust_', '').replace('pixiv_', '').replace('search_', '')
            if type == 'user':
                url = 'https://www.pixiv.net/member_illust.php?id={}'.format(url)
            elif type == 'bookmark':
                url = 'https://www.pixiv.net/bookmark.php?id={}'.format(url)
            elif type == 'illust':
                url = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id={}'.format(url)
            elif type == 'search':
                url = 'https://www.pixiv.net/search.php?s_mode=s_tag&word={}'.format(url)
                url = clean_url(url)
            else:
                self.Invalid('{}{}: ???'.format(header, url))
                return 'stop'
            self.url = url
        self.print_('PIXIV_TYPE: {}'.format(type))
        self.pixiv_type = type
        try:
            self.api = pixiv_auth.get_api()
            # cheap API call to detect an expired token; force re-login if so
            if 'error' in self.api.user_detail(11):
                self.api = pixiv_auth.get_api(force=True)
        except Exception as e:
            self.print_(print_error(e)[0])
            self.Invalid(tr_('로그인 실패: {}{}\n[옵션 - 설정 - 픽시브 설정 - 로그인] 에서 설정해주세요.').format(header, url))
            return 'stop'
# Beispiel #22 (score 0) — extraction artifact from the source listing, not code
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
    """Collect image info from an Instagram profile page.

    Reads the profile HTML to find the post count, then pages through
    the GraphQL timeline API until `n` posts (capped at n_max) have been
    gathered, and expands each post into its individual images.
    """
    print_ = get_print(cw)

    # the profile page sometimes renders without the media count; retry
    for try_ in range(4):
        try:
            html = read_html(url, session, cw)
            m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)',
                          html)
            if m is None:
                raise Exception('Invalid page')
            break
        except Exception as e:
            e_ = e
            print_(print_error(e)[0])
    else:
        raise e_
    n = min(int(m.groups()[0]), n_max)  # claimed post count, capped

    data = get_sd(url, html=html, cw=cw)

    uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
    csrf_token = data['config']['csrf_token']  # unused, but the lookup validates the page schema
    session.cookies.set(name='ig_pr',
                        value='1',
                        path='/',
                        domain='.instagram.com')

    cursor = ''
    edges = []
    bad = 0  # consecutive failed API calls
    while True:
        check_alive(cw)

        variables = {
            'id': uploader_id,
            'first': 12,
        }
        if cursor:
            variables['after'] = cursor

        media = None
        try:
            j = get_query('003056d32c2554def87228bc3fd9668a', variables,
                          session, cw)
            media = j['data']['user']['edge_owner_to_timeline_media']
            sleep(2)  # rate limit
        except Exception as e:
            if bad > 10:
                raise Exception('no media')
            else:
                print_(u'no media.. retry... ({}) {}'.format(
                    bad + 1,
                    print_error(e)[0]))
                sleep(12 * bad, cw)  # back off harder on repeated failures
                bad += 1
                continue
        bad = 0

        edges_new = media.get('edges')
        if not edges_new or not isinstance(edges_new, list):
            print('no edges_new')
            break

        edges += edges_new

        s = u'{} {}  ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        if len(edges) >= n:
            break

        page_info = media.get('page_info')
        if not page_info:
            break
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor')
        if not cursor:
            break

    if len(edges) <= n / 2:
        # far fewer posts than the profile claims -> assume a broken crawl
        raise Exception(u'Too short: {} / {}'.format(len(edges), n))

    imgs = []
    for edge in edges:
        node = edge['node']
        shortcode = node['shortcode']  # renamed from `id` (shadowed builtin)
        post_url = u'https://www.instagram.com/p/{}/'.format(shortcode)
        for img in Node(post_url, session=session, cw=cw, media=node).imgs:
            imgs.append(img)
        if len(imgs) >= n_max:
            break

    return imgs
    def _pagination(self,
                    url_api,
                    params=None,
                    entry_tweet="tweet-",
                    entry_cursor="cursor-bottom-"):
        """Yield tweets from a paginated Twitter timeline API endpoint.

        Follows the bottom cursor from page to page and stops when the
        cursor disappears or stops changing.
        """
        if params is None:
            params = self.params.copy()

        while True:
            cursor = None
            self.print_('cursor: {}'.format(params.get("cursor")))

            # 2303
            # the API call fails transiently; retry with a long sleep
            n_try = 20
            for try_ in range(n_try):
                try:
                    data = self._call(url_api, params=params)
                    tweets = data["globalObjects"]["tweets"]
                    break
                except Exception as e:
                    e_ = e
                    e_msg = print_error(e)[0]
                    if try_ < n_try - 1:
                        self.print_('retry... _pagination ({})\n{}'.format(
                            try_ + 1, e_msg))
                        sleep(30)
            else:
                raise e_

            users = data["globalObjects"]["users"]
            for instr in data["timeline"]["instructions"]:
                for entry in instr.get("addEntries", {}).get("entries", []):
                    if entry["entryId"].startswith(entry_tweet):
                        tid = entry["content"]["item"]["content"]["tweet"][
                            "id"]
                        if tid not in tweets:
                            self.print_(
                                "Skipping unavailable Tweet {}".format(tid))
                            continue
                        tweet = tweets[tid]
                        # attach the author object so callers need not look up `users`
                        tweet["user"] = users[tweet["user_id_str"]]

                        yield tweet

                    elif entry["entryId"].startswith(entry_cursor):
                        cursor = entry["content"]["operation"]["cursor"][
                            "value"]

                # NOTE(review): this cursor check sits inside the
                # instructions loop, so an instruction that carries no
                # cursor entry ends the whole generator early — it looks
                # like it was meant to be one level out; confirm against
                # upstream before changing the indentation.
                if not cursor or params.get('cursor') == cursor:
                    print('same cursor')
                    return
                params["cursor"] = cursor
            if params.get("cursor") is None:  # nothing
                break
def get_imgs_legacy(username,
                    session,
                    title,
                    types,
                    n=None,
                    format='[%y-%m-%d] id_ppage',
                    cw=None,
                    mode='media',
                    method='tab',
                    imgs=None):
    """Scrape media tweets via the legacy (pre-GraphQL) timeline endpoints.

    Starts on the profile media timeline ('tab') and falls back to the
    search timeline ('search', then cursor-based 'search2') when a page
    stops returning new tweets. Returns a list of img objects.

    Note: the scraped source had an identifier censored to ``f**k``
    (a syntax error); it is restored here as ``fail_count``.
    """
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))

    artist, username = get_artist_username(username, session)  #

    # Range; fix: `n` may be None (the default) -> use the UI maximum
    n_ui = get_max_range(cw)
    n = n_ui if n is None else max(n, n_ui)

    max_pos = None
    ids_set = set()
    if imgs:
        # resuming a previous crawl: skip tweets we already have
        for img in imgs:
            ids_set.add(img.id)
    else:
        imgs = []
    fail_count = 0  # consecutive pages that yielded no new tweets
    min_position = None
    while len(imgs) < n:
        # Build the timeline URL for the current mode/method
        if mode == 'media':
            if method == 'tab':
                foo = '&max_position={}'.format(
                    max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/profiles/show/{}/media_timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                    username, foo)
                print_('max_pos={},  imgs={}'.format(max_pos, len(imgs)))
            elif method == 'search':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(
                        username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(
                        username)
                q = quote(q, '')
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1&reset_error_state=false'.format(
                    q)
                print_('max_id={},  imgs={}'.format(max_id, len(imgs)))
            elif method == 'search2':  # 1028; search with an explicit cursor
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(
                        username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(
                        username)
                q = quote(q, '')
                foo = '&max_position={}'.format(
                    max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                    q, foo)
                print_('max_pos={},  max_id={},  imgs={}'.format(
                    max_pos, max_id, len(imgs)))
            else:
                raise Exception('Invalid method: {}'.format(method))
        elif mode == 'likes':
            foo = '&max_position={}'.format(
                max_pos) if max_pos is not None else ''
            url = 'https://twitter.com/{}/likes/timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                username, foo)
        print(url)

        hdr = {
            "X-Requested-With": "XMLHttpRequest",
            # value was censored ("******") in the scraped source; the
            # legacy web client sends "yes" here -- TODO confirm
            "X-Twitter-Active-User": "yes",
        }

        # Fetch + parse with retries; a failed parse usually means a
        # rate-limit/login page, so rotate the user agent
        for try_ in range(16):
            if cw and not cw.alive:
                return
            try:
                html = downloader.read_html(
                    url,
                    session=session,
                    referer='https://twitter.com/{}'.format(username),
                    headers=hdr)  #err
            except Exception as e:
                e_msg = print_error(e)[-1]
                print_('retry... ({}) {}\n{}'.format(try_, url, e_msg))
                change_ua(session)
                continue
            try:
                data = json.loads(html)
            except Exception as e:
                change_ua(session)
                soup = Soup(html)
                login = soup.find('div', class_='LoginForm-input')
                if login and method == 'tab':
                    raise Exception('Login required!')
                print_('can not load json: {}'.format(e))
                sleep(1)
                continue
            break
        else:
            print_('over try')
            if not imgs:
                raise Exception('No imgs')
            break

        if 'items_html' in data:
            html = data['items_html']
        else:
            print_('no items_html')
            session.cookies.clear()  # ???

        soup = Soup(html)
        tweets = soup.findAll('div', class_='tweet') + soup.findAll(
            'span', class_='grid-tweet')

        ids = []
        for tweet in tweets:
            id = int(tweet.attrs['data-tweet-id'])
            if id in ids_set:
                print('duplicate')
                continue
            ids.append(id)
            ids_set.add(id)
            tweet = Tweet(tweet, format, types, session, cw)
            for img in tweet.imgs:
                imgs.append(img)

        if n is not None and len(imgs) >= n:
            break

        if not ids:
            # empty page: tolerate a few, then fall back to the next method
            foo = 4 if method != 'search2' else 16
            if len(imgs) == 0:
                raise Exception('No Image')
            elif fail_count > foo:
                if method == 'tab':  ### fall back to search
                    method = 'search'
                    fail_count = 0
                    continue
                elif method == 'search' and not ids and min_position is not None:  ### fall back to search2
                    method = 'search2'
                    max_pos = min_position
                    fail_count = 0
                    continue
                else:
                    print('too many empty pages; giving up')
                    break
            else:
                print('empty page; retrying with a new user agent')
                change_ua(session)
                fail_count += 1
        elif fail_count:
            print('reset fail count')
            fail_count = 0

        max_pos_new = data.get('min_position')  # 1028
        if max_pos_new is None:
            if ids:
                max_pos_new = min(ids)
            else:
                max_pos_new = max_pos  # keep the old position
        max_pos = max_pos_new

        if data.get('min_position'):
            min_position = data['min_position']
            print('min_position:', min_position)

        try:
            if cw is not None:
                if not cw.alive:
                    break
                cw.setTitle('{}  {} (@{}) - {}'.format(tr_('읽는 중...'), artist,
                                                       username, len(imgs)))
        except Exception as e:
            print(e)
            raise

    return imgs