Beispiel #1
0
 def parse_search_result(cls, html):
     """Extract illustration entries from a search-result HTML page.

     The result items are looked up in this order:

     1. ``<li>`` items inside ``<section class="column-search-result">``;
     2. if that section is missing, inside the first
        ``<ul class="_image-items autopagerize_page_element">``;
     3. if no items were found there, inside the second such ``<ul>``.

     :param html: HTML text of the page.
     :return: ``None`` when ``html`` is empty; otherwise a list with one
         entry per successfully parsed item. Each entry is built from a
         dict with the keys ``url``, ``title``, ``mark_count`` and, when
         the author link could be parsed, ``user`` (``name``, ``id``,
         ``page``), and is passed through ``parse_dict`` before being
         stored. The list is empty when no items are found.
     """
     if not html:
         return None
     main = BeautifulSoup(html)
     datas = []
     item_class = re.compile(r"image-item\s*")
     work_class = re.compile(r"work  _work\w*")
     items_ul_attrs = {"class": "_image-items autopagerize_page_element"}

     # Locate the search-result section.
     section = main.find("section", attrs={"class": "column-search-result"})
     if not section:
         # Looking up the section by class failed; fall back to the <ul>
         # holding the image items (special case: recommendation layout).
         try:
             section = main.find("ul", attrs=items_ul_attrs)
         except Exception:
             section = None
         if not section:
             print("search normal result is empty")
             return datas

     lis = section.findAll("li", attrs={"class": item_class})
     if not lis:
         # Special case: the items may live in the second matching <ul>.
         try:
             uls = main.findAll("ul", attrs=items_ul_attrs)
             if len(uls) >= 2:
                 lis = uls[1].findAll("li", attrs={"class": item_class})
         except Exception:
             lis = None
         if not lis:
             print("search normal result is empty")
             return datas

     for li in lis:
         try:
             data = {
                 "url": PIXIV_URL + li.find("a", attrs={"class": work_class})["href"],
                 "title": li.find("h1", attrs={"class": "title"}).text,
             }
             # Non-critical information: a failure here must not discard
             # the main data collected for this item.
             try:
                 user_a = li.find("a", attrs={"class": "user ui-profile-popup"})
                 data["user"] = {
                     "name": user_a["title"],
                     "id": user_a["data-user_id"],
                     "page": PIXIV_URL + user_a["href"],
                 }
             except Exception as user_error:
                 print("Parse User Warning")
                 print(user_error)
             count_a = li.find("a", attrs={"class": "bookmark-count _ui-tooltip"})
             data["mark_count"] = count_a.text if count_a else 0
             datas.append(parse_dict(data))
         except Exception as item_error:
             # Skip items that cannot be parsed and keep going.
             print("parse_search_result Warning")
             print(item_error)
     return datas
Beispiel #2
0
 def parse_illustration_topic(cls, html):
     """Extract topic (article card) entries from an HTML page.

     Every ``<li class="article-card-container">`` is turned into a dict
     with the keys ``label``, ``href``, ``title``, ``pub_time`` and
     ``tags`` (a list of tag texts), which is passed through
     ``parse_dict`` before being stored.

     :param html: HTML text of the page.
     :return: ``None`` when ``html`` is empty; otherwise the list of
         parsed entries. Cards that fail to parse are reported on stdout
         and skipped.
     """
     if not html:
         return None
     main = BeautifulSoup(html)
     lis = main.findAll("li", attrs={"class": "article-card-container"})
     datas = []
     label_attrs = {
         "class": "arc__thumbnail-label _category-label large spotlight"
     }
     for li in lis:
         try:
             data = {"label": li.find("span", attrs=label_attrs).text}
             # NOTE: the trailing space in the class value is kept as in
             # the original lookup.
             a = li.find("h2", attrs={"class": "arc__title "}).find("a")
             data["href"] = BASE_URL + a["href"]
             data["title"] = a.text
             data["pub_time"] = li.find("time").text
             tags = li.findAll("div",
                               attrs={"class": "tls__list-item small"})
             data["tags"] = [tag.text for tag in tags]
             datas.append(parse_dict(data))
         except Exception as e:
             # Skip cards that cannot be parsed and keep going.
             print("Get Topics Warning")
             print(e)
     return datas
Beispiel #3
0
 def find_title_image(cls, main):
     """Extract the title illustration block from a parsed page.

     :param main: parsed document (or tag) exposing ``find``; the block
         is searched under ``<div class="aie__icon-title-container">``.
     :return: the result of ``parse_dict`` applied to a dict with the
         keys ``image``, ``image_page``, ``title``, ``author_page`` and
         ``author``; ``None`` when any part of the expected structure is
         missing (the error is printed to stdout).
     """
     try:
         aie_info = main.find("div",
                              attrs={"class": "aie__icon-title-container"})
         # NOTE: "aie__uesr-icon" is looked up exactly as spelled here.
         icon_img = aie_info.find("img", attrs={"class": "aie__uesr-icon"})
         data = {"image": icon_img["src"]}
         user_container = aie_info.find(
             "div", attrs={"class": "aie__title-user-name-container"})
         image_a = user_container.find(
             "p", attrs={"class": "aie__title"}).find("a")
         data["image_page"] = image_a['href']
         data["title"] = image_a.text
         user_a = user_container.find(
             "p", attrs={"class": "aie__user-name"}).find("a")
         data["author_page"] = user_a["href"]
         data["author"] = user_a.text
         return parse_dict(data)
     except Exception as e:
         # Any missing element means there is no usable title block.
         print("Get topic Title Warning:")
         print(e)
         return None
 def parse_search_result(cls, html):
     """Extract illustration entries from the JSON embedded in a search page.

     The items are read from the ``data-items`` attribute of
     ``<input id="js-mount-point-search-result-list">`` and decoded as
     JSON.

     :param html: HTML text of the page.
     :return: ``None`` when ``html`` is empty; otherwise a list with one
         entry per decoded item. Each entry is built from a dict with the
         keys ``url``, ``title``, ``id``, ``mark_count`` and ``user``
         (``name``, ``id``) and passed through ``parse_dict``. The list
         is empty (or partial) when the element is missing or the data
         cannot be decoded.
     """
     if not html:
         return None
     main = BeautifulSoup(html)
     datas = []
     result_list = main.find("input", attrs={"id": "js-mount-point-search-result-list"})
     # Test the looked-up element itself (not the ``input`` builtin, which
     # is always truthy) so a missing mount point is handled explicitly.
     if result_list is None:
         print("search normal result is empty")
         return datas
     try:
         # "&quotquot;" is presumably a malformed quote entity seen in the
         # page data -- TODO confirm against a real response.
         json_str = str(result_list['data-items']).replace("&quotquot;", '"')
         if not json_str:
             print("search normal result is empty")
             return datas
         items = json.loads(json_str)
         for item in items or []:
             user = {
                 "name": item['userName'],
                 "id": item['userId'],
             }
             data = {
                 "url": item['url'],
                 "title": item['illustTitle'],
                 "id": item['illustId'],
                 "mark_count": item['bookmarkCount'],
                 'user': user,
             }
             datas.append(parse_dict(data))
     except Exception:
         # Best effort: keep whatever was parsed before the failure.
         print("search normal result is empty")
     return datas
 def parse_illustration(cls, html):
     """Collect the illustrations listed on a collection/topic page.

     :param html: HTML text of the page.
     :return: ``None`` when ``html`` is empty; otherwise a list with one
         entry per work block that could be parsed. Each entry is built
         from a dict with the keys ``author``, ``author_page``, ``title``,
         ``image_page`` and, when an image tag was found, ``image``, and
         is passed through ``parse_dict``.
     """
     if not html:
         return None
     soup = BeautifulSoup(html)
     # Title-image extraction (find_title_image) is currently disabled.
     illust_class = re.compile(r'am__work gtm__illust-collection-illusts-\d*')
     work_divs = soup.findAll("div", attrs={"class": illust_class})
     if not work_divs:
         # Some topic pages only carry the plain "am__work" class.
         work_divs = soup.findAll("div", attrs={"class": "am__work"})

     image_class = re.compile(r"am__work__illust\w*")
     results = []
     for work in work_divs:
         try:
             author_link = work.find(
                 "a", attrs={"class": "author-img-container inner-link"})
             title_heading = work.find(
                 "h3", attrs={"class": "am__work__title"})
             title_link = title_heading.find("a")
             entry = {
                 "author": author_link.text,
                 "author_page": author_link['href'],
                 "title": title_link.text,
                 "image_page": title_link["href"],
             }
             img = work.find("img", attrs={"class": image_class})
             if not img:
                 # Animated works use a poster image instead.
                 img = work.find("img", attrs={"class": "ugoira-poster"})
             if not img:
                 print(
                     entry["title"] + ":" + entry["image_page"] + ":" +
                     "can't find image。please use quality=1 mode download")
             else:
                 entry["image"] = img["src"]
             results.append(parse_dict(entry))
         except Exception as err:
             # Report the broken block and move on to the next one.
             print("Parse illustrations Warning:")
             print(err)
     return results
Beispiel #6
0
 def parse_popular_introduction(cls, html):
     """Extract the "popular" entries shown on a search-result page.

     Items are the ``<li>`` elements whose class matches ``image-item``
     inside ``<section class="popular-introduction">``.

     :param html: HTML text of the page.
     :return: ``None`` when ``html`` is empty; otherwise a list with one
         entry per successfully parsed item. Each entry is built from a
         dict with the keys ``url`` and ``title`` and passed through
         ``parse_dict``. The list is empty when the section or its items
         are missing.
     """
     if not html:
         return None
     main = BeautifulSoup(html)
     section = main.find("section", attrs={"class": "popular-introduction"})
     datas = []
     if not section:
         print("search popular result is empty")
         return datas
     lis = section.findAll("li", attrs={"class": re.compile(r"image-item\s*")})
     if not lis:
         print("search popular result is empty")
         return datas
     work_class = re.compile(r"work  _work\w*")
     for li in lis:
         try:
             data = {
                 "url": PIXIV_URL + li.find("a", attrs={"class": work_class})["href"],
                 "title": li.find("h1", attrs={"class": "title"}).text,
             }
             datas.append(parse_dict(data))
         except Exception as e:
             # Skip items that cannot be parsed and keep going.
             print("parse_popular_introduction Warning:")
             print(e)
     return datas
Beispiel #7
0
def testbs4():
    """Parse the local file ``test.html`` with bs4 as a manual check.

    Reads ``test.html`` from the current directory, looks up every
    ``<li>`` whose class matches ``image-item`` and builds one entry per
    item with the keys ``url``, ``title``, ``mark_count`` and, when the
    author link could be parsed, ``user``. The first matching work link
    of each item is printed for inspection.

    :return: list of entries, each passed through ``parse_dict``. Items
        that fail to parse are reported on stdout and skipped.
    """
    from bs4 import BeautifulSoup
    import re
    # Close the file deterministically instead of leaking the handle.
    with open("test.html") as html_file:
        html = html_file.read()
    soup = BeautifulSoup(html)
    lis = soup.find_all("li", class_=re.compile(r"image-item\s*"))
    work_class = re.compile(r"work  _work\w*")
    datas = []
    for li in lis:
        try:
            # Debug output; an item without such a link raises IndexError
            # here and is skipped by the handler below.
            url = li.find_all("a", class_=re.compile(r"work _work\s*"))
            print(url[0])
            data = {"url": PIXIV_URL + li.find_all("a", class_=work_class, limit=1)[0]['href'],
                    "title": li.find("h1", attrs={"class": "title"}).text}
            # Non-critical information: a failure here must not discard
            # the main data collected for this item.
            try:
                user_a = li.find("a", attrs={"class": "user ui-profile-popup"})
                data["user"] = {
                    "name": user_a["title"],
                    "id": user_a["data-user_id"],
                    "page": PIXIV_URL + user_a["href"],
                }
            except Exception as user_error:
                print("Parse User Warning")
                print(user_error)
            count_a = li.find("a", attrs={"class": "bookmark-count _ui-tooltip"})
            data["mark_count"] = count_a.text if count_a else 0
            datas.append(parse_dict(data))
        except Exception as item_error:
            print("parse_search_result Warning")
            print(item_error)
    return datas