def parse_search_result(cls, html):
    """Parse illustration entries from a Pixiv search-result page.

    Args:
        html: raw HTML of the search page; falsy input yields None.

    Returns:
        None when html is empty, otherwise a list of dicts with keys
        "url", "title", "mark_count" and (when parseable) "user"
        holding "name"/"id"/"page". Returns an empty list when no
        results are found.
    """
    if not html:
        return None
    main = BeautifulSoup(html)
    # Locate the normal search-result section.
    section = main.find("section", attrs={"class": "column-search-result"})
    datas = []
    if not section:
        try:
            # Special case: the landing page shows recommendations instead.
            section = main.find(
                "ul",
                attrs={"class": "_image-items autopagerize_page_element"})
            if not section:
                print("search normal result is empty")
                return datas
        except Exception:
            print("search normal result is empty")
            return datas
    lis = section.findAll("li", attrs={"class": re.compile(r"image-item\s*")})
    if not lis:
        try:
            # Special case: recommendations live in the second matching <ul>.
            uls = main.findAll(
                "ul",
                attrs={"class": "_image-items autopagerize_page_element"})
            if len(uls) >= 2:
                ul = uls[1]
                lis = ul.findAll(
                    "li", attrs={"class": re.compile(r"image-item\s*")})
            if not lis:
                print("search normal result is empty")
                return datas
        except Exception:
            print("search normal result is empty")
            return datas
    for li in lis:
        try:
            data = {"url": PIXIV_URL + li.find(
                        "a",
                        attrs={"class": re.compile(r"work _work\w*")})["href"],
                    "title": li.find("h1", attrs={"class": "title"}).text}
            # Secondary info: a parse failure here must not drop the item.
            try:
                user_a = li.find(
                    "a", attrs={"class": "user ui-profile-popup"})
                data["user"] = {
                    "name": user_a["title"],
                    "id": user_a["data-user_id"],
                    "page": PIXIV_URL + user_a["href"],
                }
            except Exception as e:
                print("Parse User Warning")
                # BUG FIX: Exception.message does not exist on Python 3.
                print(e)
            count_a = li.find(
                "a", attrs={"class": "bookmark-count _ui-tooltip"})
            data["mark_count"] = count_a.text if count_a else 0
            datas.append(parse_dict(data))
        except Exception as e:
            print("parse_search_result Warning")
            print(e)
            continue
    # BUG FIX: the original fell off the end and implicitly returned None
    # after successfully collecting results.
    return datas
def parse_illustration_topic(cls, html):
    """Parse topic (spotlight article) cards from a Pixivision list page.

    Args:
        html: raw page HTML; falsy input yields None.

    Returns:
        None when html is empty, otherwise a list of dicts with keys
        "label", "href", "title", "pub_time" and "tags" (list of str).
    """
    if not html:
        return None
    main = BeautifulSoup(html)
    lis = main.findAll("li", attrs={"class": "article-card-container"})
    datas = []
    for li in lis:
        try:
            data = {
                "label": li.find(
                    "span",
                    attrs={
                        "class":
                        "arc__thumbnail-label _category-label large spotlight"
                    }).text
            }
            a = li.find("h2", attrs={"class": "arc__title "}).find("a")
            data["href"] = BASE_URL + a["href"]
            data["title"] = a.text
            data["pub_time"] = li.find("time").text
            data["tags"] = [
                tag.text for tag in li.findAll(
                    "div", attrs={"class": "tls__list-item small"})
            ]
            datas.append(parse_dict(data))
        except Exception as e:
            print("Get Topics Warning")
            print(e)
            continue
    # BUG FIX: the original had no return statement, silently discarding
    # every parsed topic.
    return datas
def find_title_image(cls, main):
    """Extract the header illustration block from a parsed topic page.

    Args:
        main: a BeautifulSoup document for the topic page.

    Returns:
        A parsed dict with keys "image", "image_page", "title",
        "author_page" and "author", or None when the header block
        cannot be found/parsed.
    """
    try:
        aie_info = main.find(
            "div", attrs={"class": "aie__icon-title-container"})
        # NOTE: "aie__uesr-icon" matches the site's own (misspelled)
        # class name — do not "fix" the spelling.
        data = {
            "image": aie_info.find(
                "img", attrs={"class": "aie__uesr-icon"})["src"]
        }
        user_container = aie_info.find(
            "div", attrs={"class": "aie__title-user-name-container"})
        image_a = user_container.find(
            "p", attrs={"class": "aie__title"}).find("a")
        data["image_page"] = image_a['href']
        data["title"] = image_a.text
        user_a = user_container.find(
            "p", attrs={"class": "aie__user-name"}).find("a")
        data["author_page"] = user_a["href"]
        data["author"] = user_a.text
        return parse_dict(data)
    except Exception as e:
        # BUG FIX: "except Exception, e" is Python-2-only syntax.
        print("Get topic Title Warning:")
        print(e)
        return None
def parse_search_result(cls, html):
    """Parse search results from the JSON embedded in the result page.

    The result list lives in an
    <input id="js-mount-point-search-result-list"> element whose
    data-items attribute holds HTML-escaped JSON.

    Args:
        html: raw page HTML; falsy input yields None.

    Returns:
        None when html is empty, otherwise a list of parsed dicts with
        keys "url", "title", "id", "mark_count" and "user".
    """
    if not html:
        return None
    main = BeautifulSoup(html)
    result_list = main.find(
        "input", attrs={"id": "js-mount-point-search-result-list"})
    datas = []
    # BUG FIX: the original tested the *builtin* `input` (always truthy)
    # instead of the node we just looked up.
    if result_list:
        try:
            # BUG FIX: the attribute stores JSON with HTML-escaped quotes;
            # the original replace calls were garbled no-ops.
            json_str = str(result_list['data-items']).replace('&quot;', '"')
            if not json_str:
                print("search normal result is empty")
                return datas
            items = json.loads(json_str)
            for item in items or []:
                user = {
                    "name": item['userName'],
                    "id": item['userId'],
                }
                data = {
                    "url": item['url'],
                    "title": item['illustTitle'],
                    "id": item['illustId'],
                    "mark_count": item['bookmarkCount'],
                    'user': user,
                }
                datas.append(parse_dict(data))
        except Exception:
            print("search normal result is empty")
    return datas
def parse_illustration(cls, html):
    """Collect illustration entries (author, title, image) from a topic page.

    Args:
        html: raw topic-page HTML; falsy input yields None.

    Returns:
        None when html is empty, otherwise a list of parsed dicts with
        keys "author", "author_page", "title", "image_page" and, when
        an image element is present, "image".
    """
    if not html:
        return None
    soup = BeautifulSoup(html)
    results = []
    works = soup.findAll(
        "div",
        attrs={
            "class": re.compile('am__work gtm__illust-collection-illusts-\d*')
        })
    if not works:
        # Some topic pages use the plain class name.
        works = soup.findAll("div", attrs={"class": "am__work"})
    for work in works:
        try:
            author_link = work.find(
                "a", attrs={"class": "author-img-container inner-link"})
            title_link = work.find(
                "h3", attrs={"class": "am__work__title"}).find("a")
            entry = {
                "author": author_link.text,
                "author_page": author_link['href'],
                "title": title_link.text,
                "image_page": title_link["href"],
            }
            illust = work.find(
                "img", attrs={"class": re.compile("am__work__illust\w*")})
            if not illust:
                # Animated works (ugoira) expose a poster image instead.
                illust = work.find("img", attrs={"class": "ugoira-poster"})
            if illust:
                entry["image"] = illust["src"]
            else:
                print(
                    entry["title"] + ":" + entry["image_page"] + ":" +
                    "can't find image。please use quality=1 mode download")
            results.append(parse_dict(entry))
        except Exception as err:
            print("Parse illustrations Warning:")
            print(err)
            continue
    return results
def parse_popular_introduction(cls, html):
    """Parse the "popular works" preview section of a search page.

    Args:
        html: raw page HTML; falsy input yields None.

    Returns:
        None when html is empty, otherwise a list of parsed dicts with
        keys "url" and "title" (empty list when the section is missing).
    """
    if not html:
        return None
    main = BeautifulSoup(html)
    section = main.find("section", attrs={"class": "popular-introduction"})
    datas = []
    if not section:
        print("search popular result is empty")
        return datas
    lis = section.findAll(
        "li", attrs={"class": re.compile(r"image-item\s*")})
    if not lis:
        print("search popular result is empty")
        return datas
    for li in lis:
        try:
            data = {"url": PIXIV_URL + li.find(
                        "a",
                        attrs={"class": re.compile(r"work _work\w*")})["href"],
                    "title": li.find("h1", attrs={"class": "title"}).text}
            datas.append(parse_dict(data))
        except Exception as e:
            # BUG FIX: Python-2-only "except Exception, e" and the
            # Py3-missing Exception.message attribute.
            print("parse_popular_introduction Warning:")
            print(e)
            continue
    # BUG FIX: the original fell off the end and implicitly returned None
    # after successfully collecting results.
    return datas
def testbs4():
    """Ad-hoc smoke test: parse test.html with bs4 and collect image items.

    Returns:
        A list of parsed dicts shaped like parse_search_result output.
    """
    from bs4 import BeautifulSoup
    import re
    # BUG FIX: close the file promptly instead of leaking the handle.
    with open("test.html") as f:
        html = f.read()
    soup = BeautifulSoup(html)
    lis = soup.find_all("li", class_=re.compile(r"image-item\s*"))
    datas = []
    for li in lis:
        try:
            url = li.find_all("a", class_=re.compile(r"work _work\s*"))
            print(url[0])
            data = {"url": PIXIV_URL + li.find_all(
                        "a", class_=re.compile(r"work _work\w*"),
                        limit=1)[0]['href'],
                    "title": li.find("h1", attrs={"class": "title"}).text}
            # Secondary info: a parse failure here must not drop the item.
            try:
                user_a = li.find(
                    "a", attrs={"class": "user ui-profile-popup"})
                data["user"] = {
                    "name": user_a["title"],
                    "id": user_a["data-user_id"],
                    "page": PIXIV_URL + user_a["href"],
                }
            except Exception as e:
                print("Parse User Warning")
                # BUG FIX: e.message raises AttributeError on Python 3.
                print(e)
            count_a = li.find(
                "a", attrs={"class": "bookmark-count _ui-tooltip"})
            data["mark_count"] = count_a.text if count_a else 0
            datas.append(parse_dict(data))
        except Exception as e:
            print("parse_search_result Warning")
            print(e)
            continue
    return datas