Code Example #1
    def init(self):
        if u'bdsmlr.com/post/' in self.url:
            # Korean message: 'Individual downloads are not supported: {}'
            raise errors.Invalid(
                tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url))

        self.url = 'https://{}.bdsmlr.com'.format(self.id_)
        self.session = Session()
        clf2.solve(self.url, session=self.session, cw=self.cw)
Code Example #2
    def init(self):
        self.url = self.url.replace('bdsmlr_', '')

        if u'bdsmlr.com/post/' in self.url:
            # Korean message: 'Individual downloads are not supported: {}'
            return self.Invalid(tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url), fail=False)

        self.url = 'https://{}.bdsmlr.com'.format(self.id_)
        self.session = Session()
        clf2.solve(self.url, session=self.session, cw=self.customWidget)
Code Example #3
    def init(self):
        cw = self.cw
        self.session = Session()
        res = clf2.solve(self.url, self.session, cw)
        soup = Soup(res['html'])
        if is_captcha(soup):

            def f(html):
                return not is_captcha(Soup(html))

            clf2.solve(self.url, self.session, cw, show=True, f=f)
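
The predicate form shown here recurs in Code Examples #15 and #23 below: f receives the rendered HTML (and, at some call sites, the browser object), and clf2.solve keeps re-evaluating it after page loads until it returns a truthy value. A minimal sketch of that inferred contract, reusing the is_captcha helper from this example:

def solve_until_clean(url, session, cw=None):
    # Truthy once the page no longer shows a captcha; clf2.solve()
    # re-evaluates this after each load (behavior inferred from the
    # examples on this page, not from clf2 documentation).
    def f(html):
        return not is_captcha(Soup(html))

    # show=True surfaces the browser window so the user can solve it.
    return clf2.solve(url, session=session, cw=cw, show=True, f=f)
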
Code Example #4
    def read(self):
        if '/video/' in self.url:
            res = clf2.solve(self.url, session=self.session, cw=self.cw)
            soup = Soup(res['html'])
            title = soup.find('h1', id='post_title').text.strip()
            self.title = title
            view = soup.find('div', id='post')
            video = view.find('video')
            src = video.find('source')['src']
            src = urljoin(self.url, src)
            video = Video(src, self.url, title, self.session)
            self.urls.append(video.url)
            self.single = True
            return
        
        if '/image/' not in self.url:
            raise NotImplementedError('Not a post')

        res = clf2.solve(self.url, session=self.session, cw=self.cw)
        soup = Soup(res['html'])
        title = soup.find('h2').text
        paginator = soup.find('div', id='paginator')
        pages = [self.url]
        for a in paginator.findAll('a'):
            href = a.get('href')
            if not href:
                continue
            href = urljoin(self.url, href)
            if href not in pages:
                pages.append(href)

        imgs = []
        for i, page in enumerate(pages):
            if page == self.url:
                soup_page = soup
            else:
                soup_page = downloader.read_soup(page, session=self.session)
            view = soup_page.find('div', id='post')
            for img in view.findAll('img'):
                href = img.parent['href']
                href = urljoin(page, href)
                img = Image(href, page, len(imgs), self.session)
                imgs.append(img)
            # tr_('읽는 중...') = 'Reading...'
            self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages)))

        for img in imgs:
            self.urls.append(img.url)

        self.title = clean_title(title)
Code Example #5
def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            oid = _get_page_id(html)
            if not oid:
                raise Exception('no page_id')
            uids = re.findall(r'uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.find(r"CONFIG\['onick'\]='(.+?)'", html) or soup.find(
                'div',
                class_=lambda c: c and c.startswith('ProfileHeader_name'
                                                    )).text.strip()
            if not name:
                raise Exception('no name')
            break
        except errors.LoginRequired as e:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name
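
Code Examples #5 and #18 both wrap the solve call in a two-attempt loop built on Python's for/else: break ends the loop on success, and the else clause, which runs only when the loop never broke, re-raises the last captured failure. Stripped down to just that retry idiom (the fn argument is a stand-in for the solve-and-parse body):

def retry(fn, tries=2):
    for _ in range(tries):
        try:
            result = fn()
            break  # success; skips the loop's else clause
        except errors.LoginRequired:
            raise  # not retryable; propagate immediately
        except Exception as e:
            e_ = e
            print(e)
    else:
        # Runs only if every attempt failed (no break occurred).
        raise e_
    return result
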
Code Example #6
def get_soup(url, session=None):
    if session is None:
        session = Session()
    res = clf2.solve(url, session=session)
    soup = Soup(res['html'], apply_css=True)

    return session, soup, res['url']
Code Example #7
    def soup(self):
        if self._soup is None:
            res = clf2.solve(self.url, session=self.session)  #4070
            html = res['html']
            soup = Soup(html)
            self._soup = soup
        return self._soup
Code Example #8
def get_imgs_page(page, referer, session, cw):
    #sleep(2)
    #html = downloader.read_html(page.url, referer, session=session)
    #soup = Soup(html)

    # 2183
    res = clf2.solve(page.url, session=session)
    soup = Soup(res['html'])

    views = soup.findAll('div', class_='view-content')

    imgs = []
    for view in views:
        if view is None:
            continue
        for img in view.findAll('img'):
            img = img.attrs.get('data-original') or img.attrs.get('content')
            if not img:
                continue
            img = urljoin(page.url, img)
            if '/img/cang' in img:
                continue
            if '/img/blank.gif' in img:
                continue
            img = Image(img, page, len(imgs))
            imgs.append(img)

    if not imgs:
        raise Exception('no imgs')

    return imgs
Code Example #9
def get_soup_session(url, cw=None):
    print_ = get_print(cw)
    session = Session()
    res = clf2.solve(url, session=session, cw=cw)
    print_('{} -> {}'.format(url, res['url']))
    if res['url'].rstrip('/') == 'https://welovemanga.one':
        raise errors.LoginRequired()
    return Soup(res['html']), session
Code Example #10
def real_url(url, session=None, cw=None):
    print_ = get_print(cw)
    if session is None:
        session = Session()
    data = clf2.solve(url, session=session, cw=cw)
    url_new = data['url']
    print('url_new:', url_new)
    if url_new != url:
        # Keep the original path on the redirected (new) domain.
        url_new = urljoin(url_new, '/' + u'/'.join(url.split('/')[3:]))
        print_(u'[redirect domain] {} -> {}'.format(url, url_new))
    return url_new
Code Example #11
def read_html(url, session, cw):
    ##    html = downloader.read_html(url, session=session)
    ##    soup = Soup(html)
    ##
    ##    cf = soup.find('div', class_='cf-browser-verification')
    ##    if cf is None:
    ##        return html

    r = clf2.solve(url, cw=cw, session=session)

    return r['html']
Code Example #12
def fix_soup(soup, url, session=None, cw=None):
    '''
    fix_soup
    '''
    print_ = get_print(cw)
    if soup.find('div', class_='logo'):
        return soup
    print_('invalid soup: {}'.format(url))

    res = clf2.solve(url, session=session, cw=cw)

    return Soup(res['html'])
Code Example #13
def solve_protection(url, session, cw=None):
    print_ = get_print(cw)
    print_('Solve protection')
    r = clf2.solve(url, session=session, cw=cw)
    html = r['html']  # 1566
    '''
    session = clf2.Session(session)
    r = session.get(url)
    html = r.text
    '''
    if constants.admin:
        with open('test_manamoa.html', 'w', encoding='utf8') as f:
            f.write(html)
    #html = read_html(page.url, session=session)
    return html
Code Example #14
def get_session(url, cw=None):
    print_ = get_print(cw)
    ##    html = downloader.read_html(url)
    ##    soup = Soup(html)
    ##
    ##    cf = soup.find('div', class_='cf-browser-verification')
    ##    if cf is None:
    ##        print_('no cf protection')
    ##        return None

    print_('cf protection')
    r = clf2.solve(url, cw=cw)
    session = r['session']

    return session
Code Example #15
def get_soup(url, session=None):
    if session is None:
        session = Session()

    def f(html, browser=None):
        soup = Soup(html)
        if soup.find('form', {'name': 'fcaptcha'}):  #4660
            browser.show()
            return False
        browser.hide()
        return True

    res = clf2.solve(url, session=session, f=f)
    soup = Soup(res['html'], apply_css=True)

    return session, soup, res['url']
Code Example #16
def get_imgs_page(page, referer, session, cw=None):
    print_ = get_print(cw)
    print_(page.title)

    html = downloader.read_html(page.url, referer, session=session)
    if clf2._is_captcha(Soup(html)):  #4124
        html = clf2.solve(page.url, session, cw)['html']
    if not html:
        raise Exception('empty html')
    html = html.replace(
        '{}='.format(re.find(r"\$\(this\)\.attr\('(.+?)'", html, err='no cn')),
        'data-src=')
    soup = Soup(html)

    view = soup.find('div', class_='chapter-content')

    if not view:
        raise Exception('no chapter-content')

    imgs = []
    for img in soup.findAll('img', class_='chapter-img'):
        src = (img.get('data-pagespeed-lazy-src') or img.get('data-src')
               or img.get('data-srcset') or img.get('data-aload')
               or img['src'])
        try:
            src = base64.b64decode(src).strip().decode('utf8')
        except Exception:
            pass
        src0 = src
        src = src.replace('welovemanga.one', '1')  #
        src = urljoin(page.url, src).strip()
        if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
            continue
        if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
            continue
        if 'LoveHug_600cfd96e98ff.jpg' in src:
            continue
        if 'image_5f0ecf23aed2e.png' in src:
            continue
        if '/uploads/lazy_loading.gif' in src:
            continue
        if not imgs:
            print_(src0)
        img = Image(src, page, len(imgs))
        imgs.append(img)

    return imgs
Code Example #17
def get_pages(url, session=None, soup=None):
    if soup is None:
        res = clf2.solve(url, session=session)  #4070
        soup = Soup(res['html'])
    pages = []
    for inner in soup.findAll('div', class_='inner'):
        a = inner.find('a')
        if not a:
            continue
        href = a.attrs.get('href', '')
        if not re.search(PATTERN_ID, href):
            continue
        if a.find('img'):
            print('skip img', a.attrs.get('href'))
            continue
        href = urljoin(url, href)
        title_page = a.text
        page = Page(title_page, href)
        pages.append(page)

    pages = list(reversed(pages))
    return pages
Code Example #18
def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            m = _get_page_id(html)
            if not m:
                raise Exception('no page_id')
            oid = m.groups()[0]
            uids = re.findall('uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.findall("CONFIG\\['onick'\\]='(.+?)'", html)[0]
            break
        except errors.LoginRequired as e:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name
Code Example #19
def read_html(url, session=None, cw=None):
    r = clf2.solve(url, session=session, cw=cw)
    html = r['html']

    return html
Code Example #20
def get_session(url, cw=None):
    session = Session()
    clf2.solve(url, session=session, cw=cw)
    return session
Code Example #21
def get_soup(url):
    session = Session()
    res = clf2.solve(url, session=session)
    soup = Soup(res['html'])

    return session, soup, res['url']
Code Example #22
    def init(self):
        self.session = clf2.solve(self.url)['session']  #4541
Code Example #23
def read_channel(url, session, cw=None):
    print_ = get_print(cw)

    info = {}
    info['items'] = []

    ids = set()
    sd = {
        'count_empty': 0,
        'shown': SHOW,
    }

    max_pid = get_max_range(cw)

    def f(html, browser=None):
        soup = Soup(html)
        if is_captcha(soup):
            print('captcha')
            browser.show()
            sd['shown'] = True
        elif sd['shown'] and not SHOW:
            browser.hide()
            sd['shown'] = False
        try:
            st = soup.find('h2', class_='share-title')
            if st is None:
                st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c)
            info['uid'] = st.text.strip()
            st = soup.find('h1', class_='share-sub-title')
            if st is None:
                st = soup.find('h1',
                               class_=lambda c: c and 'ShareSubTitle' in c)
            info['nickname'] = st.text.strip()
        except Exception as e:
            print_(print_error(e)[0])
        c = 0
        ids_now = set()
        items = soup.findAll('div', class_='video-feed-item') + soup.findAll(
            'div', class_=lambda c: c and 'DivItemContainer' in c)
        for div in items:
            a = div.find('a')
            if a is None:
                continue
            href = a['href']
            if not href:
                continue
            m = re.search(PATTERN_VID, href)
            if m is None:
                continue
            id_video = int(m.group('id'))
            ids_now.add(id_video)
            if id_video in ids:
                continue
            ids.add(id_video)
            info['items'].append({'id': id_video})
            c += 1

        print_('items: {}'.format(len(info['items'])))
        if len(info['items']) >= max_pid:
            info['items'] = info['items'][:max_pid]
            return True

        browser.runJavaScript(
            'window.scrollTo(0, document.body.scrollHeight);')
        sleep(15, cw)

        if c or (ids_now and min(ids_now) > min(ids)):
            sd['count_empty'] = 0
        else:
            print_('empty')
            sd['count_empty'] += 1
        # tr_('읽는 중...') = 'Reading...'
        msg = '{}  {} (tiktok_{}) - {}'.format(tr_('읽는 중...'),
                                               info.get('nickname'),
                                               info.get('uid'),
                                               len(info['items']))
        if cw:
            if not cw.alive:
                raise Exception('cw dead')
            cw.setTitle(msg)
        else:
            print(msg)
        return sd['count_empty'] > 4

    res = clf2.solve(url, session, cw, f=f, timeout=1800, show=SHOW, delay=0)

    if not info['items']:
        raise Exception('no items')

    return info
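
Code Example #23 turns the f callback into an incremental scraper: each invocation parses whatever has loaded so far, scrolls the embedded browser via runJavaScript, and returns True only once enough items were collected or several passes produced nothing new. A stripped-down sketch of that scroll-and-collect loop; collect_items and enough are hypothetical stand-ins for the parsing and stop logic above:

def scroll_collect(url, session, cw=None):
    items = []

    def f(html, browser=None):
        items.extend(collect_items(Soup(html)))  # hypothetical parser
        if enough(items):  # hypothetical stop condition
            return True  # truthy return ends clf2.solve()
        # Trigger the site's infinite scroll, then wait for new items;
        # clf2.solve() will call this predicate again afterwards.
        browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
        sleep(15, cw)
        return False

    clf2.solve(url, session, cw, f=f, timeout=1800)
    return items
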
Code Example #24
def get_soup(url: str) -> BeautifulSoup:
    res = clf2.solve(url)
    return Soup(res["html"])
Code Example #25
def solve_protection(url, session, cw=None):
    print_ = get_print(cw)
    print_('Solve protection')
    r = clf2.solve(url, session=session, cw=cw)
    html = r['html']  # 1566
    return html
Code Example #26
def get_soup(url: str):
    res = clf2.solve(url)
    return Soup(res["html"])
Code Example #27
def read_html(url, session):
    res = clf2.solve(url, session=session)
    return res['html']
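
Across all of these examples the shape is the same: create a Session, let clf2.solve drive a browser past the protection, then parse the returned HTML and reuse the now-cleared session for ordinary requests. A minimal sketch of that shared pattern, assuming only what the examples above demonstrate (the project's Session and Soup wrappers, and solve's result dict with 'html', 'url', and 'session' keys):

def fetch_protected(url):
    session = Session()  # the project's requests-session wrapper

    # solve() drives the browser; the examples above read res['html'],
    # res['url'] (the post-redirect URL), and res['session'] from it.
    res = clf2.solve(url, session=session)

    soup = Soup(res['html'])
    # The session now carries the clearance cookies, so follow-up pages
    # can be fetched directly, e.g. downloader.read_soup(page, session=session).
    return session, soup, res['url']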