Example 1
def get_video(url, cw=None):
    print_ = get_print(cw)

    check_alive(cw)

    data = cw.data_
    version = data['version']
    print_('version: {}'.format(version))
    if version == '0.1':
        raise errors.OutdatedExtension()
    data = data['data']
    if not isinstance(data, bytes):
        data = data.encode('utf8')
    s = base64.b64decode(data).decode('utf8')
    urls = json.loads(s)

    print_(u'\n'.join(urls[:4]))

    referer_seg = 'auto' if 'referer=force' in urls[0] else None  # 1718

    stream = M3u8_stream(url, urls=urls, n_thread=4, referer_seg=referer_seg)

    html = downloader.read_html(url)
    soup = Soup(html)

    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    title = soup.find('meta', {
        'property': 'og:title'
    }).attrs['content'].strip()

    video = Video(stream, url_thumb, url, title)

    return video
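All of these examples lean on the same helpers: check_alive(cw) aborts extraction once the client window is closed, and get_print(cw) returns a logging function bound to that window. Their definitions are not part of these excerpts; a minimal sketch, assuming cw exposes an alive flag and a print_ method (the real project may differ), could look like this:

class ExtractionAborted(Exception):
    """Raised when the client window is no longer alive."""

def check_alive(cw):
    # Stop the extraction loop as soon as the window is closed.
    if cw is not None and not cw.alive:
        raise ExtractionAborted('extraction cancelled')

def get_print(cw):
    # Log through the client window when one is attached; fall back to print.
    if cw is not None and hasattr(cw, 'print_'):
        return cw.print_
    return print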
Example 2
def process_ids(ids, info, imgs, cw, depth=0):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)
    for i, id_illust in enumerate(ids):
        try:
            info_illust = get_info(
                'https://www.pixiv.net/en/artworks/{}'.format(id_illust),
                cw,
                depth=depth + 1)
        except Exception as e:
            if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                               or type(e) == errors.LoginRequired
                               ):  # logout during extraction
                raise e
            print_('process_ids error ({}):\n{}'.format(
                depth,
                print_error(e)[0]))
            continue
        imgs += info_illust['imgs']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
Example 3
def get_imgs_channel(url, html=None, cw=None):
    print_ = get_print(cw)
    if html is None:
        html = downloader.read_html(url)
    info = get_info(url, html)

    # Range
    max_pid = get_max_range(cw)

    ids = set()
    imgs = []
    for p in range(1000):
        url_api = 'https://bcy.net/apiv3/user/selfPosts?uid={}'.format(
            info['uid'])
        if imgs:
            url_api += '&since={}'.format(imgs[-1].id)
        data_raw = downloader.read_html(url_api, url)
        data = json.loads(data_raw)['data']
        items = data['items']
        if not items:
            print('no items')
            break
        c = 0
        for item in items:
            check_alive(cw)
            id = item['item_detail']['item_id']
            if id in ids:
                print('duplicate')
                continue
            c += 1
            ids.add(id)
            url_single = u'https://bcy.net/item/detail/{}'.format(id)
            imgs_single = get_imgs(url_single, cw=cw)
            print_(str(id))
            for p, img in enumerate(imgs_single):
                img = Image(img._url, url_single, id, p)
                imgs.append(img)
            s = u'{} {} - {}'.format(tr_(u'읽는 중...'), info['artist'],
                                     min(len(imgs), max_pid))
            if cw:
                cw.setTitle(s)
            else:
                print(s)

            if len(imgs) >= max_pid:
                break
        if not c:
            print('not c')
            break
        if len(imgs) >= max_pid:
            print('over max_pid:', max_pid)
            break
    return imgs[:max_pid]
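get_imgs_channel above pages through the API by passing the ID of the last collected image as a since= parameter and stops once a page yields no new items or max_pid is reached. A stripped-down sketch of that keyset-style pagination, with a hypothetical fetch_items(since) callable standing in for the bcy.net request:

def collect_since(fetch_items, max_items):
    # Keyset pagination: each request resumes after the last ID already seen.
    seen = set()
    items = []
    since = None
    while len(items) < max_items:
        batch = fetch_items(since)  # hypothetical callable returning dicts with an 'id' key
        new = [it for it in batch if it['id'] not in seen]
        if not new:
            break
        for it in new:
            seen.add(it['id'])
            items.append(it)
        since = new[-1]['id']
    return items[:max_items]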
Example 4
    def read(self):
        file = None
        files = None
        title = None
        if '/users/' in self.url or '/user/' in self.url:
            type_ = 'videos'
            try:
                if self.url.split('/users/')[1].split('/')[1] == 'images':
                    type_ = 'images'
            except:
                pass
            info = read_channel(self.url, type_, self.session, self.cw)
            title = info['title']
            urls = info['urls']
            if type_ == 'videos':
                files = [LazyFile(url, type_, self.session) for url in urls]
                file = self.process_playlist('[Channel] [{}] {}'.format(type_.capitalize(), title), files)
            elif type_ == 'images': #4499
                files = []
                for i, url in enumerate(urls):
                    check_alive(self.cw)
                    files += get_files(url, self.session, multi_post=True, cw=self.cw) #4728
                    self.title = '{} {} - {} / {}'.format(tr_('읽는 중...'), title, i, len(urls))
                title = '[Channel] [{}] {}'.format(type_.capitalize(), title)
            else:
                raise NotImplementedError(type_)

        if file is None:
            if files is None:
                files = get_files(self.url, self.session, cw=self.cw)
            for file in files:
                self.urls.append(file.url)
            file = files[0]

            if file.type == 'youtube':
                raise errors.Invalid('[iwara] Youtube: {}'.format(self.url))
            
            if file.type == 'image':
                self.single = False
            title = title or file.title
            if not self.single:
                title = clean_title(title)
            self.title = title
            
        if file.thumb is not None:
            self.setIcon(file.thumb)
Example 5
def read_channel(url, type_, session, cw=None):
    print_ = get_print(cw)
    html = read_html(url, session=session)
    soup = Soup(html)
    if soup.find('div', id='block-mainblocks-user-connect'):
        username = re.find(r'''/messages/new\?user=(.+)['"]''',
                           html,
                           err='no username')
    else:
        username = re.find(r'/users/([^/]+)', url, err='no username')
    print_('username: {}'.format(username))
    info = {}
    urls = []
    urls_set = set()
    for p in range(50):
        check_alive(cw)
        url = 'https://ecchi.iwara.tv/users/{}/{}?page={}'.format(
            username, type_, p)
        print_(url)
        html = read_html(url, session=session)
        soup = Soup(html)
        if p == 0:
            title = soup.find('h1', class_='page-title').text
            info['title'] = title.replace("'s videos",
                                          '').replace("'s images", '').strip()

        view = soup.find('div', class_='view-content')
        if view is None:
            break

        urls_new = []
        for div in view.findAll('div', class_='views-column'):
            href = div.find('a')['href']
            url_video = urljoin(url, href)
            if url_video in urls_set:
                continue
            urls_set.add(url_video)
            urls_new.append(url_video)
        if not urls_new:
            break
        urls += urls_new
    info['urls'] = urls
    return info
Example 6
def get_imgs_all(info, title, session, cw=None):
    print_ = get_print(cw)
    pages = info['pages']
    pages = page_selector.filter(pages, cw)
    imgs = []
    for p, page in enumerate(pages):
        imgs_already = get_imgs_already('daumtoon', title, page, cw)
        if imgs_already:
            imgs += imgs_already
            continue
        try:
            imgs += get_imgs(page, session, cw)
        except NotPaidError:
            print_('Not paid: {}'.format(page.title))  #3314
            continue
        if cw is not None:
            cw.setTitle(
                tr_(u'\uc77d\ub294 \uc911... {} / {}  ({}/{})').format(
                    title, page.title, p + 1, len(pages)))
        check_alive(cw)

    return imgs
Example 7
def get_ids(q, popular, cw):
    check_alive(cw)
    if q is None:
        if popular:
            url_api = 'https://j.nozomi.la/index-Popular.nozomi'
        else:
            url_api = 'https://j.nozomi.la/index.nozomi'
    else:
        if popular:
            url_api = 'https://j.nozomi.la/nozomi/popular/{}-Popular.nozomi'.format(
                quote(q))
        else:
            url_api = 'https://j.nozomi.la/nozomi/{}.nozomi'.format(quote(q))
    print(url_api)
    f = BytesIO()
    downloader.download(url_api, referer='https://nozomi.la/', buffer=f)
    data = f.read()
    ids = []
    for i in range(0, len(data), 4):
        crop = data[i:i + 4]
        id = crop[0] * 16777216 + crop[1] * 65536 + crop[2] * 256 + crop[3]
        ids.append(id)
    return ids
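The loop above decodes the .nozomi index as a flat array of big-endian 32-bit post IDs. An equivalent formulation (an illustrative rewrite, not the project's code) can use int.from_bytes:

def parse_nozomi_index(data):
    # Each post ID is stored as a big-endian unsigned 32-bit integer.
    return [int.from_bytes(data[i:i + 4], 'big')
            for i in range(0, len(data), 4)]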
Example 8
def get_info(url, cw=None, depth=0):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    if utils.ui_setting:
        ugoira_ext = [None, '.gif', '.webp',
                      '.png'][utils.ui_setting.ugoira_convert.currentIndex()]
    else:
        ugoira_ext = None
    if utils.ui_setting:
        format_ = compatstr(utils.ui_setting.pixivFormat.currentText())
    else:
        format_ = 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url):  # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data
        if FORCE_LOGIN and not login:  #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])

        if tags_matched(tags_illust, cw):
            if data['illustType'] == 2:  # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'],
                            url,
                            id_,
                            0,
                            format_,
                            info,
                            cw,
                            ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs),
                                format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url:  # User bookmarks
        id_ = api.user_id(url)
        if id_ is None:  #
            id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'],
                                                   info['artist_id'])
        ids = []
        ids_set = set()
        offset = 0
        while len(ids) < max_pid:
            data = api.bookmarks(id_, offset)
            c = 0
            for id in [work['id'] for work in data['works']]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            offset += LIMIT
            if depth == 0:
                check_alive(cw)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url:  # Search
        q = unquote(
            re.find(r'/tags/([^/]+)', url)
            or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q, order, mode, p=p)
            c = 0
            for id in [
                    illust['id'] for illust in data['illustManga']['data']
                    if 'id' in illust
            ]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url:  # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(
            info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            c = 0
            for id in api.following(p, r18=r18):
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif api.user_id(url):  # User illusts
        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'],
                                               info['artist_id'])
        ids = []
        for illusts in [data['illusts'], data['manga']]:
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]

    return info
Example 9
def wait(cw):
    check_alive(cw)
Example 10
def process_ids(ids, info, imgs, cw, depth=0, tags_add=None):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)

    class Thread(threading.Thread):
        alive = True
        rem = 0

        def __init__(self, queue):
            super().__init__(daemon=True)
            self.queue = queue

        @classmethod
        @lock
        def add_rem(cls, x):
            cls.rem += x

        def run(self):
            while self.alive:
                try:
                    id_, res, i = self.queue.popleft()
                except Exception as e:
                    sleep(.1)
                    continue
                try:
                    info_illust = get_info(
                        'https://www.pixiv.net/en/artworks/{}'.format(id_),
                        cw,
                        depth=depth + 1,
                        tags_add=tags_add)
                    res[i] = info_illust['imgs']
                except Exception as e:
                    if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                                       or type(e) == errors.LoginRequired
                                       ):  # logout during extraction
                        res[i] = e
                    print_('process_ids error (id: {}, d:{}):\n{}'.format(
                        id_, depth,
                        print_error(e)[0]))
                finally:
                    Thread.add_rem(-1)

    queue = deque()
    n, step = Downloader_pixiv.STEP
    print_('{} / {}'.format(n, step))
    ts = []
    for i in range(n):
        t = Thread(queue)
        t.start()
        ts.append(t)
    for i in range(0, len(ids), step):
        res = [[]] * step
        for j, id_illust in enumerate(ids[i:i + step]):
            queue.append((id_illust, res, j))
            Thread.add_rem(1)
        while Thread.rem:
            sleep(.001, cw)
        for imgs_ in res:
            if isinstance(imgs_, Exception):
                raise imgs_
            imgs += imgs_
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
    for t in ts:
        t.alive = False
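This variant of process_ids fans the per-illust get_info calls out to n daemon worker threads sharing a deque, and Thread.rem counts outstanding jobs so the main loop can wait for each batch of step IDs to finish before updating the title. A condensed sketch of that batching pattern, with a hypothetical worker(job_id) callable and none of the Pixiv specifics:

import threading
from collections import deque
from time import sleep

def run_batched(job_ids, worker, n_threads=2, step=8):
    # Fan worker(job_id) out to daemon threads, one batch of `step` IDs at a
    # time, collecting results in submission order.
    queue = deque()
    rem_lock = threading.Lock()
    rem = [0]  # outstanding jobs in the current batch
    results = []

    def loop():
        while True:
            try:
                job_id, res, i = queue.popleft()
            except IndexError:
                sleep(.1)
                continue
            try:
                res[i] = worker(job_id)
            finally:
                with rem_lock:
                    rem[0] -= 1

    # Daemon threads are simply left running in this sketch; the code above
    # instead flips each thread's alive flag when it is done.
    for _ in range(n_threads):
        threading.Thread(target=loop, daemon=True).start()

    for start in range(0, len(job_ids), step):
        batch = job_ids[start:start + step]
        res = [None] * len(batch)
        for i, job_id in enumerate(batch):
            with rem_lock:
                rem[0] += 1
            queue.append((job_id, res, i))
        while rem[0]:  # block until the whole batch is done, like Thread.rem
            sleep(.001)
        results.extend(res)
    return results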
Example 11
def get_info(url, cw=None, depth=0, tags_add=None):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    ugoira_ext = [None, '.gif', '.webp', '.png'
                  ][utils.ui_setting.ugoira_convert.currentIndex(
                  )] if utils.ui_setting else None
    format_ = compatstr(utils.ui_setting.pixivFormat.currentText()
                        ) if utils.ui_setting else 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url):  # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data
        if FORCE_LOGIN and not login:  #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])

        if tags_matched(tags_illust, tags_add, cw):
            if data['illustType'] == 2:  # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'],
                            url,
                            id_,
                            0,
                            format_,
                            info,
                            cw,
                            ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs),
                                format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url:  # User bookmarks
        id_ = api.user_id(url)
        if id_ is None:  #
            id_ = my_id()
        if id_ == my_id():
            rests = ['show', 'hide']
        else:
            rests = ['show']
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'],
                                                   info['artist_id'])
        ids = []
        ids_set = set()
        for rest in rests:
            offset = 0
            while len(ids) < max_pid:
                data = api.bookmarks(id_, offset, rest=rest)
                c = 0
                for id in [work['id'] for work in data['works']]:
                    if id in ids_set:
                        continue
                    ids_set.add(id)
                    ids.append(id)
                    c += 1
                if not c:
                    break
                offset += LIMIT
                if depth == 0:
                    check_alive(cw)
        process_ids(ids, info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url:  # Search
        q = unquote(
            re.find(r'/tags/([^/]+)', url)
            or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        s_mode = qs.get('s_mode', ['s_tag_full'])[0]
        scd = qs.get('scd', [None])[0]
        ecd = qs.get('ecd', [None])[0]
        type_ = qs.get('type', ['all'])[0]
        wlt = qs.get('wlt', [None])[0]
        wgt = qs.get('wgt', [None])[0]
        hlt = qs.get('hlt', [None])[0]
        hgt = qs.get('hgt', [None])[0]
        blt = qs.get('blt', [None])[0]
        bgt = qs.get('bgt', [None])[0]
        ratio = qs.get('ratio', [None])[0]
        tool = qs.get('tool', [None])[0]
        logs = [
            'order: {}'.format(order),
            'mode: {}'.format(mode),
            's_mode: {}'.format(s_mode),
            'scd / ecd: {} / {}'.format(scd, ecd),
            'type: {}'.format(type_),
            'wlt /  wgt: {} / {}'.format(wlt, wgt),
            'hlt / hgt: {} / {}'.format(hlt, hgt),
            'blt / bgt: {} / {}'.format(blt, bgt),
            'ratio: {}'.format(ratio),
            'tool: {}'.format(tool),
        ]
        print_('\n'.join(logs))
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q,
                              order,
                              mode,
                              p=p,
                              s_mode=s_mode,
                              scd=scd,
                              ecd=ecd,
                              type_=type_,
                              wlt=wlt,
                              wgt=wgt,
                              hlt=hlt,
                              hgt=hgt,
                              blt=blt,
                              bgt=bgt,
                              ratio=ratio,
                              tool=tool)
            c = 0
            for id in [
                    illust['id'] for illust in data['illustManga']['data']
                    if 'id' in illust
            ]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url:  # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(
            info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.following(p, r18=r18)
            c = 0
            for id in data['page']['ids']:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif api.user_id(url):  # User illusts
        m = re.search(r'/users/[0-9]+/([\w]+)/?([^\?#/]*)', url)
        type_ = {
            'illustrations': 'illusts',
            'manga': 'manga'
        }.get(m and m.groups()[0])
        if type_:
            types = [type_]
        else:
            types = ['illusts', 'manga']
        if m:
            tag = unquote(m.groups()[1]) or None
        else:
            tag = None
        print_('types: {}, tag: {}'.format(types, tag))

        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'],
                                               info['artist_id'])

        ids = []
        for type_ in types:
            illusts = data[type_]
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        if not ids:
            raise Exception('no imgs')
        process_ids(ids,
                    info,
                    imgs,
                    cw,
                    depth,
                    tags_add=[tag] if tag else None)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]

    return info
Example 12
def get_imgs_more(username,
                  session,
                  title,
                  types,
                  n=None,
                  format='[%y-%m-%d] id_ppage',
                  cw=None,
                  mode='media',
                  method='tab',
                  imgs=None):
    print_ = get_print(cw)
    imgs = imgs or []
    print_('imgs: {}, types: {}'.format(len(imgs), ', '.join(types)))

    artist, username = get_artist_username(username, session, cw)  #

    # Range
    n = max(n or 0, get_max_range(cw))

    ids_set = set(img.id for img in imgs)

    count_no_tweets = 0
    count_no_imgs = 0

    while check_alive(cw) or len(imgs) < n:
        if options.get('experimental') or count_no_tweets:  #2687, #3392
            filter_ = ''
        else:
            filter_ = ' filter:media'
        cache_guest_token = bool(count_no_tweets)

        if ids_set:
            max_id = min(ids_set) - 1
            q = 'from:{} max_id:{} exclude:retweets{} -filter:periscope'.format(
                username, max_id, filter_)
        else:
            q = 'from:{} exclude:retweets{} -filter:periscope'.format(
                username, filter_)
        print(q)

        tweets = []
        for tweet in list(
                TwitterAPI(session, cw, cache_guest_token).search(q)):
            id = int(tweet['id'])
            if id in ids_set:
                print_('duplicate: {}'.format(id))
                continue
            ids_set.add(id)
            tweets.append(tweet)

        if tweets:
            exists_more_imgs = False
            for tweet in tweets:
                imgs_tweet = get_imgs_from_tweet(tweet, session, types, format,
                                                 cw)
                if imgs_tweet:
                    imgs += imgs_tweet
                    exists_more_imgs = True
            if exists_more_imgs:
                count_no_imgs = 0
            else:
                count_no_imgs += 1
                if count_no_imgs >= RETRY_MORE_IMGS:  #4130
                    break
            count_no_tweets = 0
        else:
            count_no_tweets += 1
            change_ua(session)
            if count_no_tweets >= RETRY_MORE:
                break
            print_('retry... {}'.format(count_no_tweets))
            continue

        msg = '{}  {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username,
                                         len(imgs))
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)

    return imgs
Example 13
def get_imgs(username,
             session,
             title,
             types,
             n=0,
             format='[%y-%m-%d] id_ppage',
             cw=None):
    print_ = get_print(cw)

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_) and cw:
        for name in cw.names_old:
            name = os.path.basename(name)
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    ids_sure = sorted(ids)[:-100]
    max_id = max(ids_sure) if ids_sure else 0  #3201

    # 2303
    imgs_old = []
    for id_ in sorted(ids, reverse=True):
        for p, file in enumerate(
                sorted(os.path.join(dir_, name) for name in names[id_])):
            img = Image(file, '', id_, 0, p, format, cw, False)
            img.url = LazyUrl_twitter(None, lambda _: file, img)
            img.filename = os.path.basename(file)
            imgs_old.append(img)

    imgs_new = []
    enough = False
    c_old = 0
    for tweet in TwitterAPI(session, cw).timeline_media(username):
        check_alive(cw)
        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break

        if id_ in ids:
            print_('duplicate: {}'.format(id_))
            c_old += 1
            continue
        ids.add(id_)

        imgs_new += get_imgs_from_tweet(tweet, session, types, format, cw)

        if len(imgs_new) + c_old >= n:  #3201
            break

        msg = '{}  {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)

    if not enough and not imgs_new and c_old == 0:
        raise Exception('no imgs')

    imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

    if len(imgs) < n:
        imgs = get_imgs_more(username,
                             session,
                             title,
                             types,
                             n,
                             format,
                             cw,
                             imgs=imgs)

    return imgs[:n]
Example 14
def get_imgs(url, title=None, range_=None, cw=None):
    if 'donmai.us/artists' in url:
        raise NotImplementedError('Not Implemented')
    if 'donmai.us/posts/' in url:
        raise NotImplementedError('Not Implemented')

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    if range_ is None:
        range_ = range(1, 1001)
    print(range_)
    imgs = []
    i = 0
    empty_count = 0
    empty_count_global = 0
    url_imgs = set()
    while i < len(range_):
        check_alive(cw)
        p = range_[i]
        url = setPage(url, p)
        print_(url)
        soup = read_soup(url, cw)
        articles = soup.findAll('article')
        if articles:
            empty_count_global = 0
        else:
            empty_count += 1
            if empty_count < 4:
                s = 'empty page; retry... {}'.format(p)
                print_(s)
                continue
            else:
                empty_count = 0
                empty_count_global += 1

        if empty_count_global >= 6:
            break

        for article in articles:
            id = article.attrs['data-id']

            #url_img = article.attrs['data-file-url'].strip()
            url_img = urljoin(
                url,
                article.find('a', class_='post-preview-link')['href'])  #4160

            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                img = Image(id, url_img, cw)
                imgs.append(img)

        if len(imgs) >= max_pid:
            break

        if cw is not None:
            cw.setTitle('{}  {} - {}'.format(tr_('읽는 중...'), title, len(imgs)))
        i += 1

    return imgs[:max_pid]
Example 15
def read_soup(url, cw):
    check_alive(cw)
    wait(cw)
    return downloader.read_soup(url)
Example 16
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
    print_ = get_print(cw)

    for try_ in range(4):
        try:
            html = read_html(url, session, cw)
            m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)',
                          html)
            if m is None:
                raise Exception('Invalid page')
            break
        except Exception as e:
            e_ = e
            print_(print_error(e)[0])
    else:
        raise e_
    n = int(m.groups()[0])
    n = min(n, n_max)

    data = get_sd(url, html=html, cw=cw)

    uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
    csrf_token = data['config']['csrf_token']  #
    session.cookies.set(name='ig_pr',
                        value='1',
                        path='/',
                        domain='.instagram.com')

    cursor = ''
    edges = []
    bad = 0
    while True:
        check_alive(cw)

        variables = {
            'id': uploader_id,
            'first': 12,
        }
        if cursor:
            variables['after'] = cursor
        #print_(variables)#

        media = None
        try:
            j = get_query('003056d32c2554def87228bc3fd9668a', variables,
                          session, cw)
            media = j['data']['user']['edge_owner_to_timeline_media']
            sleep(2)  #
        except Exception as e:
            if bad > 10:
                raise Exception('no media')
            else:
                print_(u'no media.. retry... ({}) {}'.format(
                    bad + 1,
                    print_error(e)[0]))
                sleep(12 * bad, cw)
                bad += 1
                continue
        bad = 0

        edges_new = media.get('edges')
        if not edges_new or not isinstance(edges_new, list):
            print('no edges_new')
            break

        edges += edges_new

        s = u'{} {}  ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        if len(edges) >= n:
            break

        page_info = media.get('page_info')
        if not page_info:
            break
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor')
        if not cursor:
            break

    if len(edges) <= n / 2:
        raise Exception(u'Too short: {} / {}'.format(len(edges), n))

    imgs = []
    for edge in edges:
        node = edge['node']
        type = node['__typename']
        id = node['shortcode']
        url = u'https://www.instagram.com/p/{}/'.format(id)
        ##        if type in ['GraphVideo', 'GraphImage']:
        ##            single = True
        ##        else:
        ##            single = False
        for img in Node(url, session=session, cw=cw, media=node).imgs:
            imgs.append(img)
        if len(imgs) >= n_max:
            break

    return imgs
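The loop in Example 16 is a standard cursor pagination: the end_cursor of each GraphQL response is fed back as the after variable until has_next_page is false or enough edges have been collected. The generic shape, with a hypothetical fetch_page(cursor) callable standing in for get_query:

def paginate(fetch_page, limit):
    # Cursor pagination: request pages until the API reports no further page,
    # the cursor disappears, or `limit` items have been collected.
    items = []
    cursor = None
    while len(items) < limit:
        page = fetch_page(cursor)  # hypothetical callable returning one page dict
        items += page.get('edges') or []
        info = page.get('page_info') or {}
        if not info.get('has_next_page'):
            break
        cursor = info.get('end_cursor')
        if not cursor:
            break
    return items[:limit]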