def get_score(self):
    score_url = 'http://219.242.68.33/xuesheng/cjcx.aspx'
    soup = Soup(self.http_request.session, score_url)
    all_scoreifo = [item.text.strip() for item in soup.find_all('td')]
    indexs = all_scoreifo[0::10]
    years = all_scoreifo[2::10]
    terms = all_scoreifo[3::10]
    units = all_scoreifo[5::10]
    natures = all_scoreifo[7::10]
    courses = all_scoreifo[8::10]
    scores = map(lambda x: ' / '.join(x),
                 [item.split('\n') for item in all_scoreifo[9::10]])
    average = soup.find(id="ctl00_ContentPlaceHolder1_lblpjcj").text
    total = soup.find(id="ctl00_ContentPlaceHolder1_lblKcms").text
    credit = soup.find(id="ctl00_ContentPlaceHolder1_lblXfs").text
    tabletitle = ['序号', '课程', '成绩', '学分', '学年', '学期', '性质']
    conts = []
    for index, year, term, unit, nature, course, score in \
            zip(indexs, years, terms, units, natures, courses, scores):
        temp = [index, course.strip(), score.replace('\n', ''),
                unit, year, term, nature]
        conts.append(temp)
    if self.display:
        table_print(tabletitle, conts)
        table_print(['平均成绩', '课程门数', '已获得学分'],
                    [[average, total, credit]])
    return conts

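# A minimal sketch (not part of the original scraper) of the stride-10 slicing
# used in get_score() above. It assumes every score row renders exactly ten
# <td> cells, so column k of row r lands at index r*10 + k in the flattened
# cell list; the dummy values below are illustrative only.
def _demo_stride_slicing():
    cells = ['{}-{}'.format(row, col) for row in range(2) for col in range(10)]
    indexs = cells[0::10]   # column 0 of every row -> ['0-0', '1-0']
    years = cells[2::10]    # column 2 of every row -> ['0-2', '1-2']
    courses = cells[8::10]  # column 8 of every row -> ['0-8', '1-8']
    return indexs, years, courses
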
def get_content(self):
    if self.content != "" and self.type == "blog":
        return self.content
    soup = Soup(self.url)
    # extract blog content
    self.content += soup.find("div", {"class": "content_body"}).get_text()
    return self.content

def trending(self):
    all_trending = []
    # get soup
    soup = Soup(self.BASE_URL + "/blog")
    # locate the html tags
    for a in soup.find("nav", {"class": "blog-trending"}).findAll("a"):
        # construct blog object
        i = Item(self.BASE_URL + a.get("href"), a.get_text())
        i.type = "blog"
        all_trending.append(i)
    return all_trending

def newest(self):
    newest_posts = []
    # compose url
    url = self.BASE_URL + BLOG
    soup = Soup(url)
    a_tags = soup.find("div", {"class": "blog_posts-list"}).findAll("a")
    i = 0
    for a_tag in a_tags:
        url = self.BASE_URL + a_tag.get("href")
        title = a_tag.get_text()
        if i % 10 == 1:
            item = Item(url, title)
            item.type = "blog"
            newest_posts.append(item)
        i += 1
    return newest_posts

def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    if 'page=dapi' not in url.lower():
        tags = get_tags(url)
        tags = quote(tags, safe='/')
        tags = tags.replace('%20', '+')
        url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    ids = set()
    for p in range(500): #1017
        url = setPage(url, p)
        print_(url)
        html = try_n(4, sleep=30)(downloader.read_html)(url) #3340
        soup = Soup(html)

        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            id_ = post.attrs['id']
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = post.attrs['file_url']
            img = Image(id_, url_img)
            imgs.append(img)
        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
    return imgs

def topic(self, topic):
    if topic not in topics:
        return "Topic not Found"
    posts = []
    url = topics[topic]
    soup = Soup(url)
    a_tags = soup.find("div", {"class": "blog_posts-list"}).findAll("a")
    i = 0
    for a_tag in a_tags:
        url = self.BASE_URL + a_tag.get("href")
        title = a_tag.get_text()
        if i % 10 == 1:
            item = Item(url, title)
            item.type = "blog"
            posts.append(item)
        i += 1
    return posts

def init(self):
    type = self.url.split('sankakucomplex.com')[0].split('//')[-1].strip('.').split('.')[-1]
    if type == '':
        type = 'www'
    if type not in ['chan', 'idol', 'www']:
        raise Exception('Not supported subdomain')
    self.type_sankaku = type
    self.url = self.url.replace('&commit=Search', '')
    self.url = clean_url(self.url)
    self.session = Session()

    if self.type_sankaku != 'www':
        login(type, self.session, self.cw)

    if self.type_sankaku == 'www':
        html = downloader.read_html(self.url, session=self.session)
        self.soup = Soup(html)

def get_soup_session(url, cw=None):
    print_ = get_print(cw)
    session = Session()
    res = clf2.solve(url, session=session, cw=cw)
    print_('{} -> {}'.format(url, res['url']))
    if res['url'].rstrip('/') == 'https://welovemanga.one':
        raise errors.LoginRequired()
    return Soup(res['html']), session

def init(self):
    self.url = self.url.replace('sankaku_', '')

    if '/post/' in self.url:
        return self.Invalid('Single post is not supported')

    if 'sankakucomplex.com' in self.url:
        self.url = self.url.replace('http://', 'https://')
        type = self.url.split('sankakucomplex.com')[0].split('//')[-1].strip('.').split('.')[-1]
        if type == '':
            type = 'www'
        if type not in ['chan', 'idol', 'www']:
            raise Exception('Not supported subdomain')
    else:
        url = self.url
        url = url.replace(' ', '+')
        while '++' in url:
            url = url.replace('++', '+')
        url = urllib.quote(url)
        url = url.replace('%2B', '+')
        url = url.replace('%20', '+')

        if url.startswith('[chan]'):
            type = 'chan'
            url = url.replace('[chan]', '', 1).strip()
        elif url.startswith('[idol]'):
            type = 'idol'
            url = url.replace('[idol]', '', 1).strip()
        elif url.startswith('[www]'):
            type = 'www'
            url = url.replace('[www]', '', 1).strip()
        else:
            raise Exception('Not supported subdomain')
        self.url = u'https://{}.sankakucomplex.com/?tags={}'.format(type, url)

    self.type_sankaku = type
    self.url = self.url.replace('&commit=Search', '')
    self.url = clean_url(self.url)
    self.session = Session()

    if self.type_sankaku != 'www':
        login(type, self.session, self.customWidget)

    if self.type_sankaku == 'www':
        html = downloader.read_html(self.url, session=self.session)
        self.soup = Soup(html)

def get_info(url, html):
    soup = Soup(html)
    info = {}
    uname = soup.find('div', class_='user-name') or soup.find('p', class_='uname') or soup.find('div', class_='user-info-name')
    info['artist'] = uname.text.strip()
    j = get_ssr_data(html)
    if '/detail/' in url:
        info['uid'] = j['detail']['detail_user']['uid']
        info['id'] = j['detail']['post_data']['item_id']
    else:
        info['uid'] = j['homeInfo']['uid']
    return info

def fix_url(url, cw=None):
    print_ = get_print(cw)
    if '?' in url:
        tail = url.split('?')[1]
    else:
        tail = None
    html = downloader.read_html(url, methods={'requests'})
    soup = Soup(html)
    meta = soup.find('meta', {'itemprop': 'url'})
    if meta:
        url_new = meta.attrs['content']
        if tail:
            url_new = u'{}?{}'.format(url_new, tail)
        print_(u'redirect: {} -> {}'.format(url, url_new))
    else:
        url_new = url
        print_(u'no redirect')
    return url_new

def get_info(url, html):
    soup = Soup(html)
    info = {}
    uname = soup.find('div', class_='user-name') or soup.find('p', class_='uname') or soup.find('div', class_='user-info-name')
    info['artist'] = uname.text.strip()
    s = cut_pair(html.split('window.__ssr_data = JSON.parse("')[1])
    j = json.loads(json.loads(u'"{}"'.format(s)))
    if '/detail/' in url:
        info['uid'] = j['detail']['detail_user']['uid']
        info['id'] = j['detail']['post_data']['item_id']
    else:
        info['uid'] = j['homeInfo']['uid']
    return info

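# A small standalone sketch (hypothetical data, not the real page payload) of
# why get_info() above calls json.loads twice: the page embeds an escaped JSON
# string inside `window.__ssr_data = JSON.parse("...")`, so the first loads()
# decodes the quoted string literal and the second parses the JSON it contains.
def _demo_double_decode():
    import json
    s = '{\\"homeInfo\\": {\\"uid\\": 42}}'           # roughly what cut_pair() returns (assumed)
    data = json.loads(json.loads(u'"{}"'.format(s)))  # -> {'homeInfo': {'uid': 42}}
    return data['homeInfo']['uid']                    # -> 42
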
def f(url):
    if '/viewer/' in url:
        html = read_html(url)
        id = re.find('/works/([0-9]+)', html)
        url = 'https://comic.pixiv.net/works/{}'.format(id)
    html = read_html(url)
    soup = Soup(html)
    pages = get_pages(soup, url)
    return pages

def get_imgs(url, title, cw=None):
    print_ = get_print(cw)
    imgs = []
    for p in range(1, 1001):
        url = setPage(url, p)
        print_(url)
        for try_ in range(4):
            try:
                html = downloader.read_html(url, user_agent=downloader.hdr['User-Agent'])
                #sleep(1)
                break
            except Exception as e:
                print(e)
        else:
            raise
        soup = Soup(html)

        view = soup.find('div', class_='photos-list')
        if view is None:
            if p == 1:
                raise errors.LoginRequired()
            else:
                break # Guest user

        for img in view.findAll('img'):
            img = img.attrs['data-src']
            img = Image(img, url, len(imgs))
            imgs.append(img)

        pgn = soup.find('ul', class_='pagination')
        ps = [getPage(a.attrs['href']) for a in pgn.findAll('a')] if pgn else []
        if not ps or p >= max(ps):
            print('max p')
            break

        msg = '{} {} ({} / {})'.format(tr_('읽는 중...'), title, p, max(ps))
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)
    return imgs

def read_gallery(url, cw=None):
    print_ = get_print(cw)
    info = {}
    html = downloader.read_html(url)
    soup = Soup(html)

    h1 = soup.find('h1')
    if h1.find('a'):
        url = h1.find('a')['href']
        return read_gallery(url, cw)
    info['title'] = h1.text.strip()
    info['url'] = setPage(url, 1)

    imgs = []
    ids = set()
    for p in range(1, 101):
        print_('p: {}'.format(p))
        url = setPage(url, p)
        html = downloader.read_html(url)
        data = get_data(html)
        photos = data['photosGalleryModel']['photos']
        if not photos:
            print('no photos')
            break
        for photo in photos:
            img = photo['imageURL']
            id = photo['id']
            referer = photo['pageURL']
            if id in ids:
                print('duplicate:', id)
                continue
            ids.add(id)
            img = Image(img, id, referer)
            imgs.append(img)

    info['imgs'] = imgs
    return info

def get(self, url_page):
    if not self._url:
        id = get_id(url_page)
        html = downloader.read_html(url_page)
        soup = Soup(html, unescape=True)
        self.title = soup.find('title').text.replace('- XVIDEOS.COM', '').strip()
        url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html)
        ext = get_ext(url)
        if ext.lower() == '.m3u8':
            url = playlist2stream(url, n_thread=5)
        url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
        self.thumb = BytesIO()
        downloader.download(url_thumb, buffer=self.thumb)
        self.filename = format_filename(self.title, id, '.mp4')
        self._url = url
    return self._url

def get_imgs_page(id_art, session, date=None, cw=None):
    print_ = get_print(cw)
    url_json = 'https://www.artstation.com/projects/{}.json'.format(id_art)
    post_url = 'https://www.artstation.com/artwork/{}'.format(id_art)
    try:
        html = downloader.read_html(url_json, session=session, referer=post_url)
        data = json.loads(html)
        imgs_ = data['assets']
    except Exception as e:
        print_(print_error(e)[-1])
        return []

    if date is None:
        date = data['created_at'][2:10]

    imgs = []
    for page, img in enumerate(imgs_):
        if not img['has_image']:
            print('no img')
            continue
        url = None
        video = None
        embed = img.get('player_embedded')
        if embed:
            soup = Soup(embed)
            url_embed = soup.find('iframe').attrs['src']
            print_('embed: {}'.format(url_embed))
            try:
                html = downloader.read_html(url_embed, session=session, referer=post_url)
                soup = Soup(html)
                url = soup.find('video').find('source').attrs['src']
            except Exception as e:
                pass
            if not url:
                try:
                    url = soup.find('link', {'rel': 'canonical'}).attrs['href']
                    print_('YouTube: {}'.format(url))
                    raise Exception('YouTube')
                    ## from extractor import youtube_downloader
                    ## video = youtube_downloader.Video(url, cw=cw)
                except Exception as e:
                    print(e)
                    url = None
        if not url:
            url = img['image_url']

        if video:
            img = video
        else:
            img = Image(post_url, date, url, page)
        img.data = data #
        imgs.append(img)
    return imgs

def init(self):
    if 'members.php' not in self.url and 'members_illust.php' not in self.url:
        raise NotImplementedError()
    id = get_id(self.url)
    html = downloader.read_html('https://nijie.info/members.php?id={}'.format(id))
    self.soup = Soup(html)
    if not isLogin(self.soup):
        raise LoginRequired()

def get_pages(url, session, soup=None, cw=None):
    if soup is None:
        html = downloader.read_html(url, session=session)
        soup = Soup(html)
    tab = soup.find('ul', class_='list-chapters')
    pages = []
    for li in tab.findAll('li'):
        text = li.find('div', class_='chapter-name').text.strip()
        href = li.parent['href']
        href = urljoin(url, href)
        page = Page(text, href)
        pages.append(page)
    if not pages:
        raise Exception('no pages')
    return pages[::-1]

def get_imgs(page, session, cw):
    print_ = get_print(cw)
    if not downloader.cookiejar.get('PROF', domain='.daum.net') and page.serviceType != 'free': #3314
        raise NotPaidError()
    html = downloader.read_html(page.url, session=session)
    header, id = get_id(page.url)
    t = int(time())
    soup = Soup(html)
    type_ = header_to_type(header)

    url_data = 'http://webtoon.daum.net/data/pc/{}/viewer/{}?timeStamp={}'.format(type_, id, t)
    data_raw = downloader.read_html(url_data, session=session, referer=page.url)
    data = json.loads(data_raw)

    if header == 'league_':
        m_type = None
    else:
        m_type = data['data']['webtoonEpisode']['multiType']
    print_('m_type: {}'.format(m_type))

    if m_type == 'chatting':
        page.url = page.url.replace('daum.net/', 'daum.net/m/')
        url_data = 'http://webtoon.daum.net/data/mobile/{}/viewer?id={}&{}'.format(type_, id, t)
        data_raw = downloader.read_html(url_data, session=session, referer=page.url)
        data = json.loads(data_raw)
        imgs = []
        for chat in data['data']['webtoonEpisodeChattings']:
            img = chat.get('image')
            if not img:
                continue
            img = Image(img['url'], page, len(imgs))
            imgs.append(img)
    else:
        url_data = 'http://webtoon.daum.net/data/pc/{}/viewer_images/{}?timeStamp={}'.format(type_, id, t)
        data_raw = downloader.read_html(url_data, session=session, referer=page.url)
        data = json.loads(data_raw)
        if not data.get('data'):
            raise NotPaidError()
        imgs = []
        for img in data['data']:
            img = Image(img['url'], page, len(imgs))
            imgs.append(img)

    return imgs

def get_sd(url, session=None, html=None, cw=None, wait=True):
    print_ = get_print(cw)

    if html:
        soup = Soup(html)
        check_error(soup, cw, wait)
        for script in soup.findAll('script'):
            j = get_j(script)
            if j:
                break
        else:
            raise Exception('no _sharedData!!')
    else:
        for try_ in range(4):
            _wait(cw)
            html = read_html(url, session, cw)
            soup = Soup(html)
            check_error(soup, cw, wait)
            for script in soup.findAll('script'):
                j = get_j(script)
                if j:
                    break
            else:
                continue
            break
        else:
            raise Exception('no _sharedData')

    for script in soup.findAll('script'):
        s = script.string
        if s and 'window.__additionalDataLoaded(' in s:
            s = cut_pair(s)
            j_add = json.loads(s)
            try:
                j['entry_data']['PostPage'][0].update(j_add)
            except:
                j['entry_data']['ProfilePage'][0].update(j_add) #2900

    # Challenge
    challenge = j['entry_data'].get('Challenge')
    if challenge:
        try:
            for cont in challenge[0]['extraData']['content']:
                title = cont.get('title')
                if title:
                    break
            else:
                raise Exception('no title')
        except:
            title = 'Err'
        raise errors.LoginRequired(title)

    # LoginAndSignupPage
    login = j['entry_data'].get('LoginAndSignupPage')
    if login:
        raise errors.LoginRequired()

    return j

def init(self):
    cw = self.cw
    self.session = Session()
    res = clf2.solve(self.url, self.session, cw)
    soup = Soup(res['html'])
    if is_captcha(soup):
        def f(html):
            return not is_captcha(Soup(html))
        clf2.solve(self.url, self.session, cw, show=True, f=f)

def _get(self, url_page):
    id = get_id(url_page)
    html = downloader.read_html(url_page)
    soup = Soup(html)
    self.title = html_unescape(soup.find('title').text).replace('- XVIDEOS.COM', '').strip()
    url = (re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html)
           or re.find(r'''.setVideoUrlHigh\(['"](.+?)['"]\)''', html)
           or re.find(r'''.setVideoUrlLow\(['"](.+?)['"]\)''', html)) #https://www.xvideos.com/video65390539/party_night
    if not url:
        raise Exception('no video url')
    ext = get_ext(url)
    if ext.lower() == '.m3u8':
        url = playlist2stream(url, n_thread=5, res=get_resolution()) #4773
    self.url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    self.filename = format_filename(self.title, id, '.mp4')
    self._url = url

def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    for script in soup.findAll('script'):
        script = script.text or script.string or ''
        hls = re.find(r'''html5player\.setVideoHLS\(['"](.+?)['"]''', script)
        if hls:
            break
    else:
        raise Exception('No VideoHLS')

    video = playlist2stream(hls)
    title = get_title(soup)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'].strip()
    video = Video(video, url, title, url_thumb)
    return video

def fix_soup(soup, url, session=None, cw=None):
    ''' fix_soup '''
    print_ = get_print(cw)
    if soup.find('div', class_='logo'):
        return soup
    print_('invalid soup: {}'.format(url))
    res = clf2.solve(url, session=session, cw=cw)
    return Soup(res['html'])

def soup(self):
    if self._soup is None:
        for try_ in range(8):
            try:
                html = downloader.read_html(self.url, session=self.session)
                break
            except Exception as e:
                print(e)
        else:
            raise
        self._soup = Soup(html)
    return self._soup

def get_imgs_page(page, session=None, cw=None):
    url = page.url
    soup = page.soup
    if soup is None:
        html = read_html(url, session=session, cw=None)
        soup = Soup(html)
        page.soup = soup

    view = soup.find('div', class_='entry-content')

    imgs = []
    for img in view.findAll('img'):
        img = img.attrs.get('data-lazy-src') or img.attrs.get('data-src')
        if img is None:
            continue
        img = urljoin(url, img)
        img = Image(img, len(imgs), page, cw)
        imgs.append(img)

    print(page.title, len(imgs), page.url)
    return imgs

def get_text(url, subtitle, update, session):
    html = downloader.read_html(url, session=session)
    soup = Soup(html)

    p = soup.find('div', id='novel_p')
    p = '' if p is None else p.text.strip()
    story = soup.find('div', id='novel_honbun').text.strip()

    if update:
        update = u' ' + update
    else:
        update = ''

    if p:
        story = u'{}\n\n{}'.format(p, story)

    text = u'''────────────────────────────────
 ◆ {}{}
────────────────────────────────
{}'''.format(subtitle, update, story)
    return text

def get(self, url):
    cw = self.cw
    d = self.d
    print_ = get_print(cw)

    for try_ in range(4):
        wait(cw)
        html = ''
        try:
            html = downloader.read_html(url, referer=self.referer, session=self.session)
            #url = 'https:' + re.findall('[Oo]riginal:? ?<a href="(//[0-9a-zA-Z_-]{2,2}.sankakucomplex.com/data/.{0,320}?)"', html)[0]
            soup = Soup(html)
            highres = soup.find(id='highres')
            url = urljoin(url, highres['href'] if highres else soup.find(id='image')['src'])
            break
        except Exception as e:
            e_msg = print_error(e)[0]
            if '429 Too many requests'.lower() in html.lower():
                t_sleep = 120 * min(try_ + 1, 2)
                e = '429 Too many requests... wait {} secs'.format(t_sleep)
            elif 'post-content-notification' in html: # sankaku plus
                print_('Sankaku plus: {}'.format(self.id))
                return ''
            else:
                t_sleep = 5
            s = '[Sankaku] failed to read image (id:{}): {}'.format(self.id, e)
            print_(s)
            sleep(t_sleep, cw)
    else:
        raise Exception('can not find image (id:{})\n{}'.format(self.id, e_msg))

    soup = Soup('<p>{}</p>'.format(url))
    url = soup.string
    ext = os.path.splitext(url)[1].split('?')[0]
    self.filename = '{}{}'.format(self.id, ext)
    return url

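# A brief sketch of the `Soup('<p>{}</p>')` trick in get() above, assuming Soup
# wraps BeautifulSoup: re-parsing the raw href decodes HTML entities such as
# '&amp;' that would otherwise end up in the download URL. The sample URL is
# illustrative only.
def _demo_unescape_href():
    from bs4 import BeautifulSoup
    raw = 'https://example.com/data/img.png?a=1&amp;b=2'
    return BeautifulSoup('<p>{}</p>'.format(raw), 'html.parser').string
    # -> 'https://example.com/data/img.png?a=1&b=2'
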
def get_video(url, session, cw):
    print_ = get_print(cw)
    html = downloader.read_html(url, session=session)
    if "document.location.href='https://login." in html:
        raise errors.LoginRequired()
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    print_('url_thumb: {}'.format(url_thumb))
    params = re.find('VodParameter *= *[\'"]([^\'"]+)[\'"]', html, err='No VodParameter')
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time()*1000))
    url_xml = 'http://stbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    if '<flag>PARTIAL_ADULT</flag>' in html:
        raise errors.LoginRequired()
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video

def get_video(url, session):
    while url.strip().endswith('/'):
        url = url[:-1]
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    params = re.findall('VodParameter *= *[\'"]([^\'"]+)[\'"]', html)[0]
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time() * 1000))
    url_xml = 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video

def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    view = soup.find('div', id='player-container-fluid')
    src_best = None
    res_best = -1
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        if res > res_best:
            src_best = src
            res_best = res

    if src_best is None:
        raise Exception('No source')

    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    #src_best = downloader.real_url(src_best)
    video = Video(src_best, url_thumb, url, title, id)
    return video

def read_channel(url, cw=None):
    print_ = get_print(cw)
    username = url.split('/users/')[1].split('/')[0]
    info = {}
    html = downloader.read_html(url)
    soup = Soup(html)
    title = soup.find('div', class_='user-name').text.strip()
    info['title'] = u'[Channel] {}'.format(title)

    items = []
    for p in range(1, 21):
        url = 'https://xhamster.com/users/{}/videos/{}'.format(username, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        items_ = soup.findAll('div', class_='thumb-list__item')
        if not items_:
            print('no items')
            break
        for item in items_:
            items.append(item)

    urls = []
    for item in items:
        url = item.a.attrs['href']
        if url in urls:
            print('duplicate:', url)
            continue
        urls.append(url)

    info['urls'] = urls
    return info

def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}
    info['title'] = soup.find('h1', id='workTitle').text.strip()
    info['artist'] = soup.find('span', id='workAuthor-activityName').text.strip()

    desc = soup.find('section', id='description')
    button = desc.find('span', class_='ui-truncateTextButton-expandButton')
    if button:
        print('decompose button')
        button.decompose()
    catch = desc.find('span', id='catchphrase-body').text.strip()
    intro = desc.find('p', id='introduction').text.strip()
    desc = u' {}\n\n\n{}'.format(catch, intro)
    info['description'] = desc

    pages = []
    for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'):
        href = urljoin(url, a.attrs['href'])
        subtitle = a.find('span', class_='widget-toc-episode-titleLabel').text.strip()
        date = a.find('time', class_='widget-toc-episode-datePublished').text.strip()
        page = Page(href, subtitle, date, len(pages) + 1)
        pages.append(page)
    info['pages'] = pages
    return info

def get_info(self):
    '''
    Fetch the student's basic information using the logged-in session.
    :return: dict of student information
    '''
    ifo_url = 'http://219.242.68.33/xuesheng/xsxx.aspx'
    soup = Soup(self.http_request.session, ifo_url)
    data = {}
    data['a.姓名'] = soup.find(id="ctl00_ContentPlaceHolder1_lblXm").text
    data['b.身份证号'] = soup.find(id="ctl00_ContentPlaceHolder1_lblSfz").text
    data['c.学号'] = soup.find(id="ctl00_ContentPlaceHolder1_lblXh").text
    data['d.班级'] = soup.find(id="ctl00_ContentPlaceHolder1_className").text
    data['e.院系'] = soup.find(id="ctl00_ContentPlaceHolder1_collegeName").text
    if self.display is True:
        tabletitle = [item[2:] for item in sorted(data.keys())]
        cont = [data[item] for item in sorted(data.keys())]
        table_print(tabletitle, cont)
    return data