def get_books(self, search_info: str) -> List[basesite.Book]:
    """Search the site for *search_info* and return the matching books.

    The endpoint requires the keyword GB2312-encoded and URL-quoted in a
    hand-built form body (a dict payload is rejected by the server, see the
    note below).  A 302 response means exactly one match and carries the
    book page URL in the Location header.

    Returns an empty list on request failure or when nothing is found.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
    }
    # NOTE: a dict payload ({'t': '1', 'searchkey': ...}) does not work for
    # this site; the body must be a pre-quoted raw query string.
    data = f'searchkey={urllib.parse.quote(search_info.encode("GB2312"))}&t=1'
    r = self.try_post_url(self.session,
                          url=self.search_url,
                          try_timeout=5,
                          headers=headers,
                          data=data,
                          allow_redirects=False)
    if r is None:
        return []
    if r.status_code == 302:
        # Exactly one book found: the server redirects straight to its page.
        return [
            basesite.Book(site=self,
                          url=r.headers['Location'],
                          name=search_info,
                          author="",
                          brief="")
        ]
    soup = BeautifulSoup(r.content.decode(self.encoding, 'ignore'),
                         'html.parser')
    if not (book_soup_list := soup.select('div.ml212 dt')):
        return []
    # BUG FIX: the original fell through at this point and implicitly
    # returned None instead of the declared List[basesite.Book].  Parse the
    # result entries (same markup as the complete sibling implementation):
    # each <dt> text looks like "书名 ... 作者:作者名 ... 简介文字".
    search_book_results = []
    for book_soup in book_soup_list:
        book_url = book_soup.select_one('a').attrs['href']
        m = re.search(r'(\w+).*作者:(\w+).*?(\w+.*)$',
                      book_soup.text,
                      flags=re.DOTALL)
        if not m:
            print(
                f'error in {self.site_info.brief_name} {book_url=} {book_soup.text=}'
            )
            return []
        book = basesite.Book(
            site=self,
            url=book_url,
            name=m.group(1),
            author=m.group(2),
            brief=m.group(3).replace("\n", "").replace("\r", "").strip())
        search_book_results.append(book)
    return search_book_results
def get_books(self, search_info: str) -> List[basesite.Book]:
    """Search via a GET URL and parse the <article> result cards.

    Each card's header link text looks like "《书名》 ... 作者 著"; the
    brief comes from the card's <p class="note">, stripped of a leading
    "简介:" prefix when present.

    Returns an empty list on request failure or when the page carries the
    site's "没有找到有关" (nothing found) message.
    """
    url = self.search_url % urllib.parse.quote(search_info)
    r = self.try_get_url(self.session, url, try_timeout=5)
    if r is None or r.text.find("没有找到有关") >= 0:
        return []
    soup = BeautifulSoup(r.content, 'html.parser')
    book_soup_list = soup.select('div.content > article')
    search_book_results = []
    for book_soup in book_soup_list:
        tmp_soup = book_soup.select_one('header > h2 > a')
        book_url = tmp_soup.attrs['href']
        m = re.search(r'《(.*?)》.* (\w+) 著', tmp_soup.text)
        if m is None:
            # Robustness fix: the original dereferenced m.group() without a
            # guard and raised AttributeError on a malformed title line;
            # skip such entries instead.
            continue
        book_name = m.group(1)
        book_author = m.group(2)
        tmp_text = book_soup.select_one('p.note').text
        if m2 := re.search(r"简介(:)?(.*)", tmp_text):
            book_brief = m2.group(2).strip()
        else:
            book_brief = tmp_text.strip()
        book = basesite.Book(site=self,
                             url=book_url,
                             name=book_name,
                             author=book_author,
                             brief=book_brief)
        search_book_results.append(book)
    # BUG FIX: the original built this list but never returned it,
    # implicitly returning None to callers expecting a list.
    return search_book_results
def get_books(self, search_info: str) -> List[basesite.Book]:
    """Run a GET search and parse the result table.

    The first <tr> is the header row and is skipped; each data row holds
    link/name (td 1), author (td 2), latest chapter (td 3) and update time
    (td 4).  Returns an empty list on request failure or no results.
    """
    r = self.try_get_url(self.session,
                         self.search_url % urllib.parse.quote(search_info),
                         try_timeout=5)
    if r is None:
        return []
    rows = BeautifulSoup(r.content, 'html.parser').select('tr')
    if len(rows) - 1 == 0:
        # Header row only: nothing was found.
        return []
    search_book_results = []
    for row in rows[1:]:
        cells = row.findAll('td')
        link = cells[1].find('a')
        latest = cells[3].find('a').text
        updated = cells[4].text.strip()
        search_book_results.append(
            basesite.Book(site=self,
                          url=self.site_url + link.attrs['href'],
                          name=link.text,
                          author=cells[2].text,
                          brief=f"最新章节:{latest} 更新时间:{updated}"))
    return search_book_results
def get_books(self, search_info: str) -> List[basesite.Book]:
    """POST a search and parse the <li> result list.

    The first <li> of div.novelslist2 is a header entry and is skipped;
    each remaining entry carries name/link (span 1), latest chapter
    (span 2), author (span 3) and update time (span 4).  Returns an empty
    list on request failure or no results.
    """
    query = f'searchtype=all&searchkey={urllib.parse.quote(search_info)}'
    r = self.try_post_url(self.session,
                          url=self.search_url,
                          try_timeout=5,
                          params=query)
    if r is None:
        return []
    entries = BeautifulSoup(r.content,
                            'html.parser').select('div.novelslist2 > ul > li')
    if len(entries) - 1 == 0:
        # Header entry only: nothing was found.
        return []
    search_book_results = []
    for entry in entries[1:]:
        spans = entry.findAll('span')
        link = spans[1].find('a')
        latest = spans[2].find('a').text
        updated = spans[4].text.strip()
        search_book_results.append(
            basesite.Book(site=self,
                          url=self.base_url + link.attrs['href'],
                          name=link.text,
                          author=spans[3].text,
                          brief=f"最新章节:{latest} 更新时间:{updated}"))
    return search_book_results
class Fox2018Site(basesite.BaseSite):
    """Site adapter for 青少年读书网 (fox2018.com)."""

    def __init__(self):
        self.site_info = basesite.SiteInfo(
            type='文学',
            statue='上线版本',
            url='http://www.fox2018.com',
            name='青少年读书网',
            brief_name='青少年',
            version='1.1',
            max_threading_number=50,
        )
        super().__init__(self.site_info)
        self.base_url = 'http://www.fox2018.com'
        self.encoding = 'GB2312'
        self.search_url = 'http://www.fox2018.com/e/search/index.php'
        self.session = requests.session()

    @basesite.print_in_out
    def get_books(self, search_info: str) -> List[basesite.Book]:
        """POST a GB2312-quoted keyword search and parse the result list.

        Returns an empty list on request failure or when no anchors are
        found under div.classify_list.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
        }
        # NOTE: a dict payload ({'keyboard': ...}) does not work for this
        # endpoint — the form body must be a pre-quoted raw string (this
        # cost half a day on May 2nd to discover).
        payload = f'show=title&classid=1&tempid=4&keyboard={urllib.parse.quote(search_info.encode(self.encoding))}'
        resp = self.try_post_url(self.session,
                                 url=self.search_url,
                                 try_timeout=5,
                                 headers=headers,
                                 data=payload)
        if resp is None:
            return []
        # The page is GB2312; decode leniently before parsing.
        soup = BeautifulSoup(resp.content.decode(self.encoding, 'ignore'),
                             'html.parser')
        anchors = soup.select('div.classify_list a')
        if not anchors:
            return []
        results = []
        for anchor in anchors:
            results.append(
                basesite.Book(site=self,
                              url=self.base_url + anchor.attrs['href'],
                              name=anchor.select_one('h3').text,
                              author=anchor.select_one('p.author i').text,
                              brief=anchor.select_one('p.brief').text))
        return results
def get_books(self, search_info: str) -> List[basesite.Book]:
    """Run a GET search and parse the <tbody> result rows.

    Rows whose link points back at search.html are navigation noise and
    are skipped.  The site provides no brief text, so "无" is used.

    Returns an empty list on request failure or no results.
    """
    url = self.search_url % urllib.parse.quote(search_info)
    r = self.try_get_url(self.session, url, try_timeout=5)
    # BUG FIX: the original dereferenced r.content without this guard and
    # raised AttributeError whenever the request failed; every sibling
    # get_books checks for None here.
    if r is None:
        return []
    soup = BeautifulSoup(r.content, 'html.parser')
    book_soup_list = soup.select('tbody > tr')
    search_book_results = []
    for book_soup in book_soup_list:
        td_soup_list = book_soup.select('td')
        book_url = self.base_url + td_soup_list[0].select_one(
            'a').attrs['href']
        if book_url.find('search.html') != -1:
            # Pager/navigation row, not a book entry.
            continue
        book_name = td_soup_list[0].text
        book_author = td_soup_list[1].text
        book_brief = "无"
        book = basesite.Book(site=self,
                             url=book_url,
                             name=book_name,
                             author=book_author,
                             brief=book_brief)
        search_book_results.append(book)
    return search_book_results
class Shuku87Site(basesite.BaseSite):
    """Site adapter for 霸气书库 (87xiaoshuo.net)."""

    def __init__(self):
        self.site_info = basesite.SiteInfo(
            type='网络小说',
            statue='上线版本',
            url='http://www.87xiaoshuo.net',
            name='霸气书库',
            brief_name='霸气网',
            version='1.1',
            max_threading_number=3,
        )
        super().__init__(self.site_info)
        self.base_url = 'http://www.87xiaoshuo.net'
        self.encoding = 'GB2312'
        self.search_url = 'http://www.87xiaoshuo.net/modules/article/search.php'
        self.session = requests.session()

    @basesite.print_in_out
    def get_books(self, search_info: str) -> List[basesite.Book]:
        """POST a GB2312-quoted keyword search and parse the result list.

        A 302 response means exactly one match, with the book page URL in
        the Location header.  Otherwise the results live in div.ml212 <dt>
        entries of the form "书名 ... 作者:作者名 ... 简介文字".

        Returns an empty list on request failure, no results, or when an
        entry fails to parse (an error line is printed in that case).
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
        }
        # NOTE: a dict payload ({'t': '1', 'searchkey': ...}) does not work
        # here — the body must be a pre-quoted raw GB2312 query string.
        payload = f'searchkey={urllib.parse.quote(search_info.encode("GB2312"))}&t=1'
        resp = self.try_post_url(self.session,
                                 url=self.search_url,
                                 try_timeout=5,
                                 headers=headers,
                                 data=payload,
                                 allow_redirects=False)
        if resp is None:
            return []
        if resp.status_code == 302:
            # Single match: the server redirects straight to the book page.
            return [
                basesite.Book(site=self,
                              url=resp.headers['Location'],
                              name=search_info,
                              author="",
                              brief="")
            ]
        soup = BeautifulSoup(resp.content.decode(self.encoding, 'ignore'),
                             'html.parser')
        entries = soup.select('div.ml212 dt')
        if not entries:
            return []
        results = []
        pattern = re.compile(r'(\w+).*作者:(\w+).*?(\w+.*)$', flags=re.DOTALL)
        for entry in entries:
            book_url = entry.select_one('a').attrs['href']
            m = pattern.search(entry.text)
            if not m:
                # Unexpected markup: report it and give up on this search.
                print(
                    f'error in {self.site_info.brief_name} {book_url=} {entry.text=}'
                )
                return []
            brief = m.group(3).replace("\n", "").replace("\r", "").strip()
            results.append(
                basesite.Book(site=self,
                              url=book_url,
                              name=m.group(1),
                              author=m.group(2),
                              brief=brief))
        return results