def init(self):
    if u'bdsmlr.com/post/' in self.url:
        # "Individual downloads are not supported: {}"
        raise errors.Invalid(tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url))
    self.url = 'https://{}.bdsmlr.com'.format(self.id_)
    self.session = Session()
    clf2.solve(self.url, session=self.session, cw=self.cw)

def init(self):
    self.url = self.url.replace('bdsmlr_', '')
    if u'bdsmlr.com/post/' in self.url:
        # "Individual downloads are not supported: {}"
        return self.Invalid(tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url), fail=False)
    self.url = 'https://{}.bdsmlr.com'.format(self.id_)
    self.session = Session()
    clf2.solve(self.url, session=self.session, cw=self.customWidget)

def init(self):
    cw = self.cw
    self.session = Session()
    res = clf2.solve(self.url, self.session, cw)
    soup = Soup(res['html'])
    if is_captcha(soup):
        def f(html):
            return not is_captcha(Soup(html))
        clf2.solve(self.url, self.session, cw, show=True, f=f)

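# The f= callbacks passed to clf2.solve here and in later snippets appear to
# share one contract: clf2 keeps polling the page, calling f on each state,
# until f returns a truthy value or the solve times out. A minimal sketch of
# such a callback, assuming a hypothetical is_blocked() predicate:
def f(html, browser=None):
    if is_blocked(Soup(html)):   # hypothetical challenge-page check
        if browser is not None:
            browser.show()       # surface the window for manual solving
        return False             # not done yet; keep waiting
    if browser is not None:
        browser.hide()
    return True                  # page is usable; solve() can return
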
def read(self):
    if '/video/' in self.url:
        res = clf2.solve(self.url, session=self.session, cw=self.cw)
        soup = Soup(res['html'])
        title = soup.find('h1', id='post_title').text.strip()
        self.title = title
        view = soup.find('div', id='post')
        video = view.find('video')
        src = video.find('source')['src']
        src = urljoin(self.url, src)
        video = Video(src, self.url, title, self.session)
        self.urls.append(video.url)
        self.single = True
        return
    if '/image/' not in self.url:
        raise NotImplementedError('Not a post')
    res = clf2.solve(self.url, session=self.session, cw=self.cw)
    soup = Soup(res['html'])
    title = soup.find('h2').text
    paginator = soup.find('div', id='paginator')
    pages = [self.url]
    for a in paginator.findAll('a'):
        href = a.get('href')
        if not href:
            continue
        href = urljoin(self.url, href)
        if href not in pages:
            pages.append(href)
    imgs = []
    for i, page in enumerate(pages):
        if page == self.url:
            soup_page = soup
        else:
            soup_page = downloader.read_soup(page, session=self.session)
        view = soup_page.find('div', id='post')
        for img in view.findAll('img'):
            href = img.parent['href']
            href = urljoin(page, href)
            img = Image(href, page, len(imgs), self.session)
            imgs.append(img)
        # tr_('읽는 중...') == "Reading..."
        self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages)))
    for img in imgs:
        self.urls.append(img.url)
    self.title = clean_title(title)

def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            oid = _get_page_id(html)
            if not oid:
                raise Exception('no page_id')
            uids = re.findall(r'uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.find(r"CONFIG\['onick'\]='(.+?)'", html) or soup.find(
                'div', class_=lambda c: c and c.startswith('ProfileHeader_name')).text.strip()
            if not name:
                raise Exception('no name')
            break
        except errors.LoginRequired:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name

def get_soup(url, session=None):
    if session is None:
        session = Session()
    res = clf2.solve(url, session=session)
    soup = Soup(res['html'], apply_css=True)
    return session, soup, res['url']

def soup(self):
    if self._soup is None:
        res = clf2.solve(self.url, session=self.session) #4070
        html = res['html']
        soup = Soup(html)
        self._soup = soup
    return self._soup

def get_imgs_page(page, referer, session, cw):
    #sleep(2)
    #html = downloader.read_html(page.url, referer, session=session)
    #soup = Soup(html)
    # 2183
    res = clf2.solve(page.url, session=session)
    soup = Soup(res['html'])
    views = soup.findAll('div', class_='view-content')
    imgs = []
    for view in views:
        if view is None:
            continue
        for img in view.findAll('img'):
            img = img.attrs.get('data-original') or img.attrs.get('content')
            if not img:
                continue
            img = urljoin(page.url, img)
            if '/img/cang' in img:
                continue
            if '/img/blank.gif' in img:
                continue
            img = Image(img, page, len(imgs))
            imgs.append(img)
    if not imgs:
        raise Exception('no imgs')
    return imgs

def get_soup_session(url, cw=None):
    print_ = get_print(cw)
    session = Session()
    res = clf2.solve(url, session=session, cw=cw)
    print_('{} -> {}'.format(url, res['url']))
    if res['url'].rstrip('/') == 'https://welovemanga.one':
        raise errors.LoginRequired()
    return Soup(res['html']), session

def real_url(url, session=None, cw=None):
    print_ = get_print(cw)
    if session is None:
        session = Session()
    data = clf2.solve(url, session=session, cw=cw)
    url_new = data['url']
    print('url_new:', url_new)
    if url_new != url:
        # keep the original path, but on the domain clf2 landed on
        url_new = urljoin(url_new, '/' + u'/'.join(url.split('/')[3:]))
        print_(u'[redirect domain] {} -> {}'.format(url, url_new))
    return url_new

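# A minimal worked example (hypothetical URLs, not from the snippets above) of
# the path-preserving rewrite real_url performs when clf2.solve ends up on a
# different domain:
from urllib.parse import urljoin

url = 'https://old.example.com/gallery/123'   # original request
url_new = 'https://new.example.com/'          # domain clf2 redirected to
url_new = urljoin(url_new, '/' + '/'.join(url.split('/')[3:]))
assert url_new == 'https://new.example.com/gallery/123'
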
def read_html(url, session, cw):
##    html = downloader.read_html(url, session=session)
##    soup = Soup(html)
##
##    cf = soup.find('div', class_='cf-browser-verification')
##    if cf is None:
##        return html
    r = clf2.solve(url, cw=cw, session=session)
    return r['html']

def fix_soup(soup, url, session=None, cw=None):
    '''Re-solve the page with clf2 when the parsed soup looks broken.'''
    print_ = get_print(cw)
    if soup.find('div', class_='logo'):
        return soup
    print_('invalid soup: {}'.format(url))
    res = clf2.solve(url, session=session, cw=cw)
    return Soup(res['html'])

def solve_protection(url, session, cw=None):
    print_ = get_print(cw)
    print_('Solve protection')
    r = clf2.solve(url, session=session, cw=cw)
    html = r['html'] # 1566
    '''
    session = clf2.Session(session)
    r = session.get(url)
    html = r.text
    '''
    if constants.admin:
        # write as text; encoding the str and writing the bytes to a
        # text-mode file would raise a TypeError on Python 3
        with open('test_manamoa.html', 'w', encoding='utf8') as f:
            f.write(html)
    #html = read_html(page.url, session=session)
    return html

def get_session(url, cw=None):
    print_ = get_print(cw)
##    html = downloader.read_html(url)
##    soup = Soup(html)
##
##    cf = soup.find('div', class_='cf-browser-verification')
##    if cf is None:
##        print_('no cf protection')
##        return None
    print_('cf protection')
    r = clf2.solve(url, cw=cw)
    session = r['session']
    return session

def get_soup(url, session=None):
    if session is None:
        session = Session()

    def f(html, browser=None):
        soup = Soup(html)
        if soup.find('form', {'name': 'fcaptcha'}): #4660
            browser.show()
            return False
        browser.hide()
        return True

    res = clf2.solve(url, session=session, f=f)
    soup = Soup(res['html'], apply_css=True)
    return session, soup, res['url']

def get_imgs_page(page, referer, session, cw=None):
    print_ = get_print(cw)
    print_(page.title)

    html = downloader.read_html(page.url, referer, session=session)
    if clf2._is_captcha(Soup(html)): #4124
        html = clf2.solve(page.url, session, cw)['html']
    if not html:
        raise Exception('empty html')
    html = html.replace('{}='.format(re.find(r"\$\(this\)\.attr\('(.+?)'", html, err='no cn')), 'data-src=')
    soup = Soup(html)

    view = soup.find('div', class_='chapter-content')
    if not view:
        raise Exception('no chapter-content')

    imgs = []
    for img in soup.findAll('img', class_='chapter-img'):
        src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src']
        try:
            src = base64.b64decode(src).strip().decode('utf8')
        except Exception:
            pass
        src0 = src
        src = src.replace('welovemanga.one', '1')
        src = urljoin(page.url, src).strip()
        if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
            continue
        if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
            continue
        if 'LoveHug_600cfd96e98ff.jpg' in src:
            continue
        if 'image_5f0ecf23aed2e.png' in src:
            continue
        if '/uploads/lazy_loading.gif' in src:
            continue
        if not imgs:
            print_(src0)
        img = Image(src, page, len(imgs))
        imgs.append(img)
    return imgs

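# The decode step above handles sites that base64-obfuscate image URLs: if
# b64decode yields valid UTF-8, the decoded URL is used; otherwise the raw
# attribute value is kept. A standalone sketch of that fallback (maybe_b64 is
# a hypothetical name):
import base64

def maybe_b64(src):
    try:
        return base64.b64decode(src).strip().decode('utf8')
    except Exception:
        return src
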
def get_pages(url, session=None, soup=None):
    if soup is None:
        res = clf2.solve(url, session=session) #4070
        soup = Soup(res['html'])
    pages = []
    for inner in soup.findAll('div', class_='inner'):
        a = inner.find('a')
        if not a:
            continue
        href = a.attrs.get('href', '')
        if not re.search(PATTERN_ID, href):
            continue
        if a.find('img'):
            print('skip img', a.attrs.get('href'))
            continue
        href = urljoin(url, href)
        title_page = a.text
        page = Page(title_page, href)
        pages.append(page)
    pages = list(reversed(pages))
    return pages

def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            m = _get_page_id(html)
            if not m:
                raise Exception('no page_id')
            oid = m.groups()[0]
            uids = re.findall(r'uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.findall(r"CONFIG\['onick'\]='(.+?)'", html)[0]
            break
        except errors.LoginRequired:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name

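# Both get_id variants above lean on Python's for/else: the else block runs
# only when the loop finishes without break, so the last captured exception is
# re-raised after every attempt fails. A stripped-down sketch, with fetch() as
# a hypothetical stand-in for the clf2.solve + parsing step:
def retry_twice(fetch):
    for try_ in range(2):
        try:
            result = fetch()
            break              # success skips the else clause
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_               # both attempts failed
    return result
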
def read_html(url, session=None, cw=None):
    r = clf2.solve(url, session=session, cw=cw)
    html = r['html']
    return html

def get_session(url, cw=None):
    session = Session()
    clf2.solve(url, session=session, cw=cw)
    return session

def get_soup(url):
    session = Session()
    res = clf2.solve(url, session=session)
    soup = Soup(res['html'])
    return session, soup, res['url']

def init(self):
    self.session = clf2.solve(self.url)['session'] #4541

def read_channel(url, session, cw=None):
    print_ = get_print(cw)
    info = {}
    info['items'] = []
    ids = set()
    sd = {
        'count_empty': 0,
        'shown': SHOW,
    }
    max_pid = get_max_range(cw)

    def f(html, browser=None):
        soup = Soup(html)
        if is_captcha(soup):
            print('captcha')
            browser.show()
            sd['shown'] = True
        elif sd['shown'] and not SHOW:
            browser.hide()
            sd['shown'] = False
        try:
            st = soup.find('h2', class_='share-title')
            if st is None:
                st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c)
            info['uid'] = st.text.strip()
            st = soup.find('h1', class_='share-sub-title')
            if st is None:
                st = soup.find('h1', class_=lambda c: c and 'ShareSubTitle' in c)
            info['nickname'] = st.text.strip()
        except Exception as e:
            print_(print_error(e)[0])
        c = 0
        ids_now = set()
        items = soup.findAll('div', class_='video-feed-item') + soup.findAll(
            'div', class_=lambda c: c and 'DivItemContainer' in c)
        for div in items:
            a = div.find('a')
            if a is None:
                continue
            href = a['href']
            if not href:
                continue
            m = re.search(PATTERN_VID, href)
            if m is None:
                continue
            id_video = int(m.group('id'))
            ids_now.add(id_video)
            if id_video in ids:
                continue
            ids.add(id_video)
            info['items'].append({'id': id_video})
            c += 1
        print_('items: {}'.format(len(info['items'])))
        if len(info['items']) >= max_pid:
            info['items'] = info['items'][:max_pid]
            return True
        browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
        sleep(15, cw)
        if c or (ids_now and min(ids_now) > min(ids)):
            sd['count_empty'] = 0
        else:
            print_('empty')
            sd['count_empty'] += 1
        # tr_('읽는 중...') == "Reading..."
        msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
        if cw:
            if not cw.alive:
                raise Exception('cw dead')
            cw.setTitle(msg)
        else:
            print(msg)
        return sd['count_empty'] > 4

    res = clf2.solve(url, session, cw, f=f, timeout=1800, show=SHOW, delay=0)
    if not info['items']:
        raise Exception('no items')
    return info

def get_soup(url: str) -> BeautifulSoup:
    res = clf2.solve(url)
    return Soup(res["html"])

def solve_protection(url, session, cw=None):
    print_ = get_print(cw)
    print_('Solve protection')
    r = clf2.solve(url, session=session, cw=cw)
    html = r['html'] # 1566
    return html

def get_soup(url: str):
    res = clf2.solve(url)
    return Soup(res["html"])

def read_html(url, session):
    res = clf2.solve(url, session=session)
    return res['html']
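
# Taken together, these snippets rely on a common clf2.solve contract: the
# result is a dict exposing at least 'html' (the solved page source), 'url'
# (the final URL after redirects/challenges) and 'session' (a session object
# carrying the solved cookies). A minimal consumer written against only that
# assumed contract:
def fetch_solved(url, session=None, cw=None):
    res = clf2.solve(url, session=session, cw=cw)
    return res['html'], res['url'], res['session']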