def init(self):
    self.url = clean_url(self.url)
    self.session = Session()
    if re.search(PATTERN_ID, self.url): #1799
        # Find the currently selected <option> in the book selector
        select = self.soup.find('select', class_='bookselect')
        for i, op in enumerate(select.findAll('option')[::-1]):
            if 'selected' in op.attrs:
                break
        else:
            raise Exception('no selected option')
        # Locate the link to the list page (matching PATTERN)
        for a in self.soup.findAll('a'):
            url = urljoin(self.url, a.get('href') or '')
            if re.search(PATTERN, url):
                break
        else:
            raise Exception('list not found')
        self.url = self.fix_url(url)
        self._soup = None
        # Restrict the download range to the page matching the selected option
        for i, page in enumerate(get_pages(self.url, self.session, self.soup)):
            if page.id == int(op['value']):
                break
        else:
            raise Exception('can not find page')
        self.cw.range_p = [i]

def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    # Build a Gelbooru post-list URL from the tags
    tags = get_tags(url)
    tags = quote(tags, safe='/')
    tags = tags.replace('%20', '+')
    url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(tags)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw, 2000)

    imgs = []
    ids = set()
    count_no_imgs = 0
    for p in range(500): #1017
        url = setPage(url, len(ids))
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        posts = soup.findAll('div', class_='thumbnail-preview')
        imgs_new = []
        for post in posts:
            id_ = int(re.find('[0-9]+', post.find('a')['id'], err='no id'))
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = urljoin(url, post.find('a')['href'])
            img = Image(id_, url_img)
            imgs_new.append(img)
        if imgs_new:
            imgs += imgs_new
            count_no_imgs = 0
        else:
            # Stop after two consecutive pages without new images
            print('no imgs')
            count_no_imgs += 1
            if count_no_imgs > 1:
                print('break')
                break
        if len(imgs) >= max_pid:
            break
        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
    if not imgs:
        raise Exception('no imgs')
    return imgs

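# setPage() above is a helper defined elsewhere in this project. A minimal
# sketch of what it is assumed to do (rewrite the paging parameter of the
# request URL) is given below; the parameter name 'pid' and the helper name
# setPage_sketch are assumptions for illustration, not the project's code.
def setPage_sketch(url, page):
    from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
    parts = urlsplit(url)
    qs = parse_qs(parts.query)
    qs['pid'] = [str(page)]  # Gelbooru-style listings use a 'pid' query parameter
    return urlunsplit(parts._replace(query=urlencode(qs, doseq=True)))
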
def get_imgs(url, title=None, customWidget=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')
    if 'page=dapi' not in url.lower():
        tags = get_tags(url).replace(' ', '+')
        url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(
            tags, 0, LIMIT)

    if customWidget is not None:
        print_ = customWidget.print_
    else:
        def print_(*values):
            sys.stdout.writelines(values + ('\n', ))

    # Range
    if customWidget is not None:
        range_pid = customWidget.range
    else:
        range_pid = None
    if range_pid is not None:
        max_pid = max(parse_range(range_pid, max=100000))
    else:
        max_pid = 2000

    imgs = []
    url_imgs = set()
    for p in range(100):
        url = setPage(url, p)
        #print_(url)
        html = downloader.read_html(url)
        soup = BeautifulSoup(html, 'html.parser')
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            url_img = post.attrs['file_url']
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                id = post.attrs['id']
                img = Image(id, url_img)
                imgs.append(img)
            if len(imgs) >= max_pid:
                break
        if len(imgs) >= max_pid:
            break
        if customWidget is not None and not customWidget.alive:
            break
        if customWidget is not None:
            customWidget.exec_queue.put(
                (customWidget, u"customWidget.setTitle(u'{} {} - {}')".format(
                    tr_(u'읽는 중...'), title, len(imgs))))
    return imgs

def get_imgs_all(url, title=None, cw=None, d=None, session=None, stories=True):
    max_pid = get_max_range(cw)
    url = clean_url(url)
    if stories:
        imgs_str = get_stories(url, title, cw=cw, session=session)
    else:
        imgs_str = []
    max_pid = max(0, max_pid - len(imgs_str))
    imgs = get_imgs(url, max_pid, title=title, cw=cw, session=session)
    return imgs_str + imgs[:max_pid]

def get_imgs(url, title=None, cw=None):
    print_ = get_print(cw)
    url = clean_url(url)
    id = get_id(url)
    url = u'https://nijie.info/members_illust.php?id={}'.format(id)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    url_imgs = set()
    for p in range(1, 1 + 100):
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        posts = soup.findAll('div', class_='nijie')
        if not posts:
            print('no posts')
            break
        c = 0
        for post in posts:
            url_img = urljoin(url, post.a.attrs['href'])
            if url_img in url_imgs:
                print('duplicate:', url_img)
                continue
            url_imgs.add(url_img)
            id = int(re.find('[?&]id=([0-9]+)', url_img))
            multi = post.find('div', class_='thumbnail-icon')
            if multi: # post contains multiple images
                imgs_ = get_imgs_post(id, url_img)
            else:
                imgs_ = [Image(id, url_img, 0)]
            imgs += imgs_
            c += 1
            if len(imgs) >= max_pid:
                break
        msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(msg)
        else:
            print(msg)
        if len(imgs) >= max_pid or c == 0:
            break
    return imgs

def get_tags(url):
    url = clean_url(url)
    qs = query_url(url)
    if 'page=favorites' in url:
        id = qs.get('id', ['N/A'])[0]
        id = u'fav_{}'.format(id)
    else:
        tags = qs.get('tags', [])
        tags.sort()
        id = u' '.join(tags)
    if not id:
        id = u'N/A'
    return id

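# query_url() is a project utility; a minimal sketch, assuming it simply
# parses the query string into a dict of value lists, could look like this
# (the name query_url_sketch is illustrative, not the project's code):
def query_url_sketch(url):
    from urllib.parse import urlparse, parse_qs
    return parse_qs(urlparse(url).query)

# Under that assumption, get_tags() above would return e.g.
#   get_tags('https://gelbooru.com/index.php?page=favorites&s=view&id=12345')
#       -> u'fav_12345'
# and, for any other URL, the sorted 'tags' query values joined with spaces,
# or u'N/A' when none are present.
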
def get_imgs(url, title=None, customWidget=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')
    if 'page=dapi' not in url.lower():
        tags = get_tags(url).replace(' ', '+')
        print(tags)
        tags = urllib.quote(tags, safe='/+')
        url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(
            tags, 0, LIMIT)

    if customWidget is not None:
        print_ = customWidget.print_
    else:
        def print_(*values):
            sys.stdout.writelines(values + ('\n', ))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    imgs = []
    url_imgs = set()
    for p in range(500): #1017
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            url_img = post.attrs['file_url']
            if url_img in url_imgs:
                print('already exists', url_img)
            else:
                url_imgs.add(url_img)
                id = post.attrs['id']
                img = Image(id, url_img)
                imgs.append(img)
        if len(imgs) >= max_pid:
            break
        if customWidget is not None:
            if not customWidget.alive:
                break
            customWidget.setTitle(u'{} {} - {}'.format(
                tr_(u'읽는 중...'), title, len(imgs)))
    return imgs

def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')
    if 'page=dapi' not in url.lower():
        tags = get_tags(url)
        tags = quote(tags, safe='/')
        tags = tags.replace('%20', '+')
        url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(
            tags, 0, LIMIT)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    ids = set()
    for p in range(500): #1017
        url = setPage(url, p)
        print_(url)
        html = try_n(4, sleep=30)(downloader.read_html)(url) #3340
        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            id_ = post.attrs['id']
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = post.attrs['file_url']
            img = Image(id_, url_img)
            imgs.append(img)
        if len(imgs) >= max_pid:
            break
        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
    return imgs

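# Image is defined per module elsewhere in this project; the call sites above
# only pass a post id and a direct file URL, so a minimal stand-in could be as
# small as the following (ImageSketch is an illustrative name; the real class
# presumably also handles file naming and downloading):
class ImageSketch:
    def __init__(self, id_, url):
        self.id_ = id_  # booru post id
        self.url = url  # file_url taken from the API response
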
def init(self):
    type = self.url.split('sankakucomplex.com')[0].split('//')[-1].strip('.').split('.')[-1]
    if type == '':
        type = 'www'
    if type not in ['chan', 'idol', 'www']:
        raise Exception('Not supported subdomain')
    self.type_sankaku = type
    self.url = self.url.replace('&commit=Search', '')
    self.url = clean_url(self.url)
    self.session = Session()

    if self.type_sankaku != 'www':
        login(type, self.session, self.cw)

    if self.type_sankaku == 'www':
        html = downloader.read_html(self.url, session=self.session)
        self.soup = Soup(html)

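# Illustrative trace of the subdomain parsing above (example URLs, not from
# the source):
#   'https://chan.sankakucomplex.com/?tags=blue_sky' -> type == 'chan'
#   'https://idol.sankakucomplex.com/?tags=blue_sky' -> type == 'idol'
#   'https://sankakucomplex.com/...'                 -> '' -> treated as 'www'
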
def init(self):
    self.url = self.url.replace('sankaku_', '')

    if '/post/' in self.url:
        return self.Invalid('Single post is not supported')

    if 'sankakucomplex.com' in self.url:
        self.url = self.url.replace('http://', 'https://')
        type = self.url.split('sankakucomplex.com')[0].split(
            '//')[-1].strip('.').split('.')[-1]
        if type == '':
            type = 'www'
        if type not in ['chan', 'idol', 'www']:
            raise Exception('Not supported subdomain')
    else:
        url = self.url
        url = url.replace(' ', '+')
        while '++' in url:
            url = url.replace('++', '+')
        url = urllib.quote(url)
        url = url.replace('%2B', '+')
        url = url.replace('%20', '+')
        # A bare tag query selects the subdomain with a '[chan]'/'[idol]'/'[www]' prefix
        if url.startswith('[chan]'):
            type = 'chan'
            url = url.replace('[chan]', '', 1).strip()
        elif url.startswith('[idol]'):
            type = 'idol'
            url = url.replace('[idol]', '', 1).strip()
        elif url.startswith('[www]'):
            type = 'www'
            url = url.replace('[www]', '', 1).strip()
        else:
            raise Exception('Not supported subdomain')
        self.url = u'https://{}.sankakucomplex.com/?tags={}'.format(type, url)

    self.type_sankaku = type
    self.url = self.url.replace('&commit=Search', '')
    self.url = clean_url(self.url)
    self.session = Session()

    if self.type_sankaku != 'www':
        login(type, self.session, self.customWidget)

    if self.type_sankaku == 'www':
        html = downloader.read_html(self.url, session=self.session)
        self.soup = Soup(html)