Example #1
    def parse_lvl_one(cls):
        if cls.book_id is None:
            print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
            return

        resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id),
                                        headers=cls.headers)
        assert resp is not None

        cls.comic_name = HttpUtils.get_content(resp,
                                               ".detail-info-title").strip()
        cls.root_folder = os.path.join("output", cls.comic_name)
        links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")

        titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
        image_numbers = HttpUtils.get_contents(
            resp, "div.detail-list-form-con a span")
        image_numbers = list(
            map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

        assert len(titles) == len(image_numbers)
        assert len(titles) == len(links)

        cnt = 0
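        # walk every chapter entry: filter by chapter/volume mode and the
        # inclusion list, then hand qualifying chapters to parse_lvl_two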
        for index in range(len(titles)):
            cls.init_thread()

            link = links[index].replace("/", "").replace("m", "")
            title = titles[index].strip()
            image_number = image_numbers[index]
            if (cls.chapter_mode == 1 and "第" not in title and "话" not in title
                    and "話" not in title) or (cls.chapter_mode == 2
                                              and "卷" not in title
                                              and "第" not in title):
                print("Skip " + title)
                continue

            is_skip = False
            if cls.inclusion_list is not None:
                for inclusion in cls.inclusion_list:
                    if inclusion not in title:
                        is_skip = True
                        break

            if not is_skip and cls.parse_lvl_two((link, title, image_number)):
                cnt += 1

        if cnt > 0:
            cls.process_thread.join()

        # safety net: restart the worker to drain any tasks left in the pool
        # (normally none remain if everything went well)
        while not cls.task_pool.empty():
            print("pool size = " + str(cls.task_pool.qsize()))
            cls.init_thread()
            cls.process_thread.join()
Example #2
    def fetch_sub_category(cls, category):
        # go into category to find sub-category info
        category_link = category[1]
        category_text = category[0]

        sub_category_data_list = list()

        if category_text in cls.category_black_list:
            return []

        soup_obj = HttpUtils.get(cls.amazon_base_url + category_link,
                                 headers=cls.amazon_headers)

        sub_category_text_list = HttpUtils.get_contents(
            soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a span")
        sub_category_link_list = HttpUtils.get_attrs(
            soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a",
            "href")

        if len(sub_category_link_list) != len(sub_category_text_list):
            print("ERROR: Sub-category number not correct")
            return []

        # no sub-category
        if len(sub_category_link_list) == 0:
            sub_category_text_list = [category_text]
            sub_category_link_list = [category_link]

        print("find lvl 2 categories for %s" % category_text)
        print(sub_category_text_list)

        # find sub-category page number
        for sub_index in range(0, len(sub_category_link_list)):
            sub_category_link = sub_category_link_list[sub_index]
            sub_category_text = sub_category_text_list[sub_index]
            soup_obj = HttpUtils.get(cls.amazon_base_url + sub_category_link,
                                     headers=cls.amazon_headers)
            page_info = HttpUtils.get_contents(
                soup_obj, "ul.a-pagination li.a-disabled")
            if len(page_info) == 2:
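                # two disabled pagination items: the code takes the second one as the last page number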
                max_page_num = page_info[1]
            elif len(page_info) == 0:
                # no pagination, only one page
                max_page_num = 1
            else:
                # five pages or fewer: the last normal page link is the maximum
                max_page_num = HttpUtils.get_contents(
                    soup_obj, "ul.a-pagination li.a-normal a")[-1]

            print("cat=%s, sub-cat=%s, page=%s" %
                  (category_text, sub_category_text, max_page_num))
            sub_category_data_list.append((category_text, sub_category_text,
                                           sub_category_link, max_page_num))

        return sub_category_data_list
Example #3
    def check_and_notify(cls):
        url = "https://www.flyertea.com/forum.php?mod=forumdisplay&orderby=dateline&sum=226&fid=226&mobile=2"
        soup_obj = HttpUtils.get(url, return_raw=False)
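        # pull thread titles, read counts and flower counts from the listing page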
        titles = list(map(lambda title: title.strip(), HttpUtils.get_contents(soup_obj, "div.n5sq_htmk p.n5_htnrbt")))
        readers = list(map(lambda x: int(x), HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_hthfcs")))
        flowers = list(
            map(lambda x: int(x) if x else 0, HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_htdzcs")))

        print(titles)
        print(readers)
        print(flowers)
Example #4
    def parse_lvl_one(cls):
        if cls.book_id is None:
            return

        url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id
        retry = 0
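        # retry the request until it succeeds, giving up after 5 attempts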
        while True:
            resp = HttpUtils.get(url)
            if resp is not None:
                break
            else:
                retry += 1

            assert retry < 5, "fail to query %s" % url

        cls.comic_name = HttpUtils.get_content(resp, "title").strip()
        links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href")

        titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a")

        assert len(titles) == len(links)

        cls.init_thread()

        for index in range(len(titles)):
            link = links[index]
            title = titles[index].strip()
            cls.parse_lvl_two((link, title))
        cls.process_thread.join()

        # safety net: restart the worker to drain any tasks left in the pool
        # (normally none remain if everything went well)
        while not cls.task_pool.empty():
            print("pool size = " + str(cls.task_pool.qsize()))
            cls.init_thread()
            cls.process_thread.join()
Example #5
    def crawl_sub_category_book(cls, sub_category_meta):
        ku_book_title_list = list()

        category_name = sub_category_meta[0]
        sub_category_name = sub_category_meta[1]
        sub_category_link = cls.amazon_base_url + sub_category_meta[2]
        page_num = int(sub_category_meta[3])

        for page in range(1, page_num + 1):
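            # fetch one result page of this sub-category and collect its book titles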
            print("reading cat=%s,sub-cat=%s,page=%s" %
                  (category_name, sub_category_name, page))
            # drop any existing page parameter before appending the current page
            url = sub_category_link.split("&page=")[0] + "&page=" + str(page)
            soup_obj = HttpUtils.get(url, headers=cls.amazon_headers)

            if soup_obj is None:
                print("blocked?")
                break

            title_list = HttpUtils.get_contents(
                soup_obj,
                "div.s-result-list div.sg-col-inner h2.a-size-mini span.a-size-medium"
            )
            current_page_title_list = list()
            for title in title_list:
                # remove meta info
                title = title.split("(")[0].split("（")[0].split("【")[0]
                ku_book_title_list.append(title)
                current_page_title_list.append(title)

            print(current_page_title_list)
            sleep(random() * 0.5 + 0.5)

        return ku_book_title_list
Example #6
    def parse_lvl_two(cls, info):
        chapter_url = info[0]
        title = info[1]

        # create folder once
        folder_name = "output/" + cls.comic_name + "/" + title
        if not os.path.exists(folder_name):
            os.makedirs(folder_name, exist_ok=True)

        #
        # path_file_number = len(glob.glob(pathname=folder_name + '/*'))
        # if path_file_number == image_number:
        #     print("Download complete: " + title)
        #     # already downloaded all
        #     return

        print("Start downloading: " + title)

        query_url = cls.root_url + chapter_url

        retry = 0
        while True:
            content = HttpUtils.get(query_url, headers=cls.headers)
            if content is not None:
                break
            retry += 1
            # give up after 5 failed attempts instead of looping forever
            assert retry < 5, "fail to query %s" % query_url

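        # the chapter's image list and base path are embedded in an inline
        # <script> block; extract them with regular expressions below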
        script_content = HttpUtils.get_contents(content, "script")
        print(script_content[2][1:].replace(";;", ";").replace(";", ";\n"))

        image_url_list = re.search(r"chapterImages.*=.*\[(.*)\];",
                                   script_content[2]).group(1).replace(
                                       "\"", "").split(",")

        path = re.search(r"chapterPath.*?=.*?\"(.*?)\";",
                         script_content[2]).group(1)

        assert len(image_url_list) > 0

        for index, image_url in enumerate(image_url_list, start=1):
            full_image_url = "http://js1.zzszs.com.cn/" + path + image_url
            file_path = "%s/%03d_%s" % (folder_name, index, image_url)
            cls.task_pool.put([file_path, full_image_url, 0])
Example #7
    def fetch_meta_data(cls):
        try:
            with open("ku_meta.txt", "r", encoding="utf-8") as fp:
                if fp.readline():
                    # already fetched, skip
                    return
        except FileNotFoundError:
            pass

        home_url = "https://www.amazon.cn/s?i=digital-text&rh=n%3A116087071%2Cn%3A116089071%2Cn%3A116176071%2Cn%3A1337022071&page=1"

        # find all category, sub-category and page number
        soup_obj = HttpUtils.get(home_url, headers=cls.amazon_headers)
        if soup_obj is None:
            print("ERROR: Cannot find category")
            return

        category_text_list = HttpUtils.get_contents(
            soup_obj,
            "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link span")
        category_link_list = HttpUtils.get_attrs(
            soup_obj,
            "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link", "href")

        if len(category_text_list) != len(category_link_list):
            print("ERROR: Category number not correct")
            return

        print("find lvl 1 categories:")
        print(category_text_list)

        category_list = list(zip(category_text_list, category_link_list))

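        # resolve sub-category metadata for every top-level category via ParallelTemplate(5)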
        parallel_template = ParallelTemplate(5)
        sub_category_data_list = parallel_template.run(cls.fetch_sub_category,
                                                       category_list)

        with open("ku_meta.txt", "w", encoding="utf-8") as fp:
            fp.write(json.dumps(sub_category_data_list))
Example #8
    def do_process(cls, link):
        resp = HttpUtils.get_with_retry(cls.root_url + link,
                                        headers=cls.headers)
        assert resp is not None

        cls.comic_name = HttpUtils.get_content(resp,
                                               ".detail-info-title").strip()
        comic_author = HttpUtils.get_content(
            resp, ".detail-info-tip span a").strip()
        comic_status = HttpUtils.get_content(
            resp, ".detail-info-tip span:nth-of-type(2) span").strip()
        titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")

        # validation
        titles = list(map(lambda x: x.strip(), titles))
        if len(titles) == 0:
            return

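        # classify each title as a chapter (話) or volume (卷) and record its number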
        chap_ids = list()
        vol_ids = list()
        for title in titles:
            num = re.search(r".+?(\d*).+?", title).group(1)
            if num == "":
                # print("Cannot parse: " + title)
                continue
            if "話" in title:
                chap_ids.append(int(num))
            elif "卷" in title:
                vol_ids.append(int(num))

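        # look for gaps in the chapter and volume numbering; only complete sets
        # are pushed to the output pool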
        max_chap = -1
        max_vol = -1
        is_missed = False
        if len(chap_ids) > 0:
            missing_ids = list()
            chap_ids.sort()
            max_chap = chap_ids[-1]

            for i in range(1, max_chap + 1):
                if i not in chap_ids:
                    missing_ids.append(i)
            if len(missing_ids) > 0:
                # print("Missing chapters: " + str(missing_ids))
                is_missed = True

        if len(vol_ids) > 0:
            missing_ids = list()
            vol_ids.sort()
            max_vol = vol_ids[-1]

            for i in range(1, max_vol + 1):
                if i not in vol_ids:
                    missing_ids.append(i)
            if len(missing_ids) > 0:
                # print("Missing volumes: " + str(missing_ids))
                is_missed = True

        if not is_missed:
            # print(">>>>>>>>>>>> WOW! FULL SET: %s <<<<<<<<<<<<" % cls.comic_name)
            cls.output_pool.put((cls.comic_name, comic_author, comic_status,
                                 max_chap, max_vol, link))
Example #9
    def zz(self):
        source_url_template = "https://bh.sb/post/category/main/page/{0}/"
        post_url = "http://www.miui.com/forum.php?mod=post&action=newthread&fid=5&extra=&topicsubmit=yes"

        self.check_in()

        max_cnt = 10
        cnt = 0
        page_num = 1
        articles = list()
        stop_flag = False
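        # collect up to max_cnt new posts, skipping articles already seen in the cache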
        while not stop_flag:
            # get article of bhsb
            soup = HttpUtils.get(source_url_template.format(page_num))
            article_urls = HttpUtils.get_attrs(soup, "h2 a", "href")
            page_num += 1

            for article_index in range(len(article_urls)):
                article_url = article_urls[article_index]
                if Cache().get(article_url) is not None:
                    continue

                article_soup = HttpUtils.get(article_url)
                titles = HttpUtils.get_contents(article_soup,
                                                ".article-content p")

                title_cnt = int(len(titles) / 2)

                for title_index in range(0, title_cnt):
                    try:
                        title = titles[title_index * 2].split("】")[1]
                        image = titles[title_index * 2 + 1]

                        if type(image) != Tag:
                            continue

                        src = image.attrs["src"]
                        if src.endswith("jpg"):
                            continue

                        message = "好玩您就点个赞,不好笑请期待下一贴~\n"
                        message += "[img]{0}[/img]".format(src)

                        if Cache().get(title) is not None:
                            continue
                        Cache().set(title, message)

                        articles.append((title, message))

                        cnt += 1

                        if cnt >= max_cnt:
                            stop_flag = True
                            break
                    except Exception:
                        pass

                if stop_flag:
                    break

                # only if all articles are included, then mark this url
                Cache().set(article_url, article_url)

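        # post each collected article to the forum, rotating through the topic
        # type ids and sleeping 45-50 minutes between posts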
        type_id_list = ["1629", "1631", "1633", "4481", "1641"]
        type_index = 0
        for (title, message) in articles:
            print((title, message))

            post_data = dict()
            post_data["posttime"] = str(int(time.time()))
            post_data["formhash"] = self.form_hash_mirror
            post_data["wysiwyg"] = "1"
            post_data["typeid"] = type_id_list[type_index]
            post_data["allownoticeauthor"] = "1"
            post_data["addfeed"] = "1"
            post_data["usesig"] = "1"
            post_data["save"] = ""
            post_data["uploadalbum"] = "-2"
            post_data["newalbum"] = "请输入相册名称"
            post_data["subject"] = title
            post_data["message"] = message

            post_result = HttpUtils.post(post_url,
                                         headers=self.site.login_headers,
                                         data=post_data,
                                         returnRaw=False)
            assert post_result is not None
            type_index = (type_index + 1) % len(type_id_list)
            time.sleep(int(random() * 300) + 2700)
Example #10
    def crawl_book(cls):
        tag_source_url = "https://book.douban.com/tag/"
        soup_obj = HttpUtils.get(tag_source_url)

        tags = HttpUtils.get_contents(soup_obj, "div.article tr td a")

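        # note: the scraped tags above are discarded in favour of this hardcoded list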
        tags = [
            '小说', '外国文学', '文学', '中国文学', '经典', '日本文学', '古典文学', '王小波', '当代文学',
            '钱钟书', '外国名著', '推理', '绘本', '青春', '东野圭吾', '科幻', '言情', '悬疑', '奇幻',
            '韩寒', '推理小说', '阿加莎·克里斯蒂', '科幻小说', '魔幻', '历史', '心理学', '哲学', '传记',
            '文化', '社会学', '艺术', '设计', '社会', '政治', '建筑', '宗教', '电影', '政治学', '数学',
            '中国历史', '回忆录', '思想', '国学', '人物传记', '人文', '音乐', '艺术史', '绘画', '戏剧',
            '西方哲学', '二战', '军事', '佛教', '近代史', '考古', '自由主义', '美术', '爱情', '旅行',
            '成长', '生活', '心理', '励志', '摄影', '教育', '游记', '灵修', '健康', '情感', '两性',
            '人际关系', '手工', '养生', '家居', '自助游', '经济学', '管理', '经济', '商业', '金融',
            '投资', '营销', '理财', '创业', '广告', '股票', '企业史', '策划', '科普', '互联网', '编程',
            '科学', '交互设计', '用户体验', '算法', '科技', 'web', 'UE', '交互', '通信', 'UCD',
            '神经网络', '程序'
        ]
        print(tags)

        book_shelf = dict()
        for tag in tags:
            for page in range(0, 10):
                url = "https://book.douban.com/tag/%s?start=%d&type=T" % (
                    tag, page * 20)
                soup_obj = HttpUtils.get(url)

                if soup_obj is None:
                    print("blocked?")
                    break

                print(tag, page)
                books_obj = soup_obj.select("#subject_list ul > li")

                if len(books_obj) == 0:
                    break

                for book_obj in books_obj:
                    try:
                        title = HttpUtils.get_attr(book_obj, "h2 a", "title")
                        rating = float(
                            HttpUtils.get_content(book_obj,
                                                  "span.rating_nums"))
                        people = int(
                            HttpUtils.get_content(book_obj,
                                                  "span.pl").strip().replace(
                                                      "人评价", "").replace(
                                                          "(",
                                                          "").replace(")", ""))

                        if people > cls.people_threshold:
                            if title in book_shelf:
                                book_shelf[title].tag.append(tag)
                            else:
                                book_shelf[title] = Book(
                                    title, rating, people, [tag])
                    except Exception:
                        pass

                # pause briefly to stay within the per-IP request limit for the time window
                sleep(random() * 0.5 + 0.5)

        books = list(book_shelf.values())

        with open("douban_book_raw.txt", "w") as fp:
            fp.write(json.dumps(books, default=Book.convert))