Example 1
    @classmethod
    def start(cls):

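        # Walk the paginated comic list; stop when a request fails or a page
        # yields no more comic links.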
        root_url = "http://www.mangabz.com/manga-list-p%d/"
        page_num = 0

        while True:
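            # init_thread() presumably (re)starts the worker that drains task_pool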
            cls.init_thread()

            page_num += 1
            print("Now page " + str(page_num))
            url = root_url % page_num
            resp = HttpUtils.get_with_retry(url, headers=cls.headers)
            if resp is None:
                break

            links = HttpUtils.get_attrs(resp, ".mh-item-detali > .title > a",
                                        "href")
            if len(links) == 0:
                break

            for link in links:
                cls.task_pool.put(link)

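        # wait for the worker thread to finish all queued downloads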
        cls.process_thread.join()
        cls.fp.close()
Example 2
    @classmethod
    def parse_lvl_one(cls):
        if cls.book_id is None:
            print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
            return

        resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id),
                                        headers=cls.headers)
        assert resp is not None

        cls.comic_name = HttpUtils.get_content(resp,
                                               ".detail-info-title").strip()
        cls.root_folder = os.path.join("output", cls.comic_name)
        links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")

        titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
        image_numbers = HttpUtils.get_contents(
            resp, "div.detail-list-form-con a span")
        image_numbers = list(
            map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

        assert len(titles) == len(image_numbers)
        assert len(titles) == len(links)

        cnt = 0
        for index in range(len(titles)):
            cls.init_thread()

            # turn a chapter href like "/m12345/" into the bare numeric id
            link = links[index].replace("/", "").replace("m", "")
            title = titles[index].strip()
            image_number = image_numbers[index]
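            # chapter_mode 1 keeps chapter-style titles (第/话/話);
            # chapter_mode 2 keeps volume-style titles (卷 or 第)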
            if (cls.chapter_mode == 1 and "第" not in title and "话" not in title
                    and "話" not in title) or (cls.chapter_mode == 2
                                              and "卷" not in title
                                              and "第" not in title):
                print("Skip " + title)
                continue

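            # keep only titles that contain every string in the inclusion list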
            is_skip = False
            if cls.inclusion_list is not None:
                for inclusion in cls.inclusion_list:
                    if inclusion not in title:
                        is_skip = True
                        break

            if not is_skip and cls.parse_lvl_two((link, title, image_number)):
                cnt += 1

        if cnt > 0:
            cls.process_thread.join()

        # safety net: should be unnecessary if everything went well, but drain
        # any tasks that are still queued
        while not cls.task_pool.empty():
            print("pool size = " + str(cls.task_pool.qsize()))
            cls.init_thread()
            cls.process_thread.join()
Example 3
    @classmethod
    def do_process(cls, link):
        resp = HttpUtils.get_with_retry(cls.root_url + link,
                                        headers=cls.headers)
        assert resp is not None

        cls.comic_name = HttpUtils.get_content(resp,
                                               ".detail-info-title").strip()
        comic_author = HttpUtils.get_content(
            resp, ".detail-info-tip span a").strip()
        comic_status = HttpUtils.get_content(
            resp, ".detail-info-tip span:nth-of-type(2) span").strip()
        titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")

        # validation
        titles = list(map(lambda x: x.strip(), titles))
        if len(titles) == 0:
            return

        chap_ids = list()
        vol_ids = list()
        for title in titles:
            # extract the first run of digits as the chapter/volume number
            match = re.search(r"(\d+)", title)
            if match is None:
                # no number in this title (e.g. extras), skip it
                continue
            number = int(match.group(1))
            if "話" in title:
                chap_ids.append(number)
            elif "卷" in title:
                vol_ids.append(number)

        max_chap = -1
        max_vol = -1
        is_missed = False
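        # flag gaps in the numbering: every id from 1 to max should be present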
        if len(chap_ids) > 0:
            missing_ids = list()
            chap_ids.sort()
            max_chap = chap_ids[-1]

            for i in range(1, max_chap + 1):
                if i not in chap_ids:
                    missing_ids.append(i)
            if len(missing_ids) > 0:
                # print("Missing chapters: " + str(missing_ids))
                is_missed = True

        if len(vol_ids) > 0:
            missing_ids = list()
            vol_ids.sort()
            max_vol = vol_ids[-1]

            for i in range(1, max_vol + 1):
                if i not in vol_ids:
                    missing_ids.append(i)
            if len(missing_ids) > 0:
                # print("Missing volumes: " + str(missing_ids))
                is_missed = True

        if not is_missed:
            # print(">>>>>>>>>>>> WOW! FULL SET: %s <<<<<<<<<<<<" % cls.comic_name)
            cls.output_pool.put((cls.comic_name, comic_author, comic_status,
                                 max_chap, max_vol, link))
Example 4
    @classmethod
    def parse_lvl_two(cls, info):
        chapter_id = info[0]
        title = info[1]
        image_number = int(info[2])

        # create the chapter folder once; exist_ok tolerates concurrent workers
        folder_name = os.path.join(cls.root_folder, title)
        os.makedirs(folder_name, exist_ok=True)

        path_file_number = len(glob.glob(pathname=folder_name + '/*'))
        if path_file_number == image_number:
            print("Downloaded:" + title)
            # already downloaded all
            return False

        print("Start downloading: " + title)

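        # the chapter's first page also serves as Referer for the image API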
        first_url = "http://www.mangabz.com/m%s/" % chapter_id

        # copy the shared headers so the Cookie/Referer changes below do not
        # leak into (and compound across) other chapters
        headers = dict(cls.headers)
        headers["Cookie"] = headers["Cookie"] + urllib.parse.quote(first_url)
        headers["Referer"] = first_url

        index = 0
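        # each chapterimage.ashx call returns a JS snippet that evaluates to a
        # batch of image URLs; index then jumps to the largest page seen so far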
        while index < image_number:
            index += 1

            query_url = "%s/m%s/chapterimage.ashx?cid=%s&page=%d" % (
                cls.root_url, chapter_id, chapter_id, index)

            content = HttpUtils.get_with_retry(query_url,
                                               headers=headers,
                                               return_raw=True)

            if content is None or content.text.strip() == "":
                print("url: " + query_url)
                print("got empty response, fail to parse image key, %s-%d" %
                      (title, index))
            else:
                try:
                    # the response body is a JS expression that evaluates to a
                    # list of image URLs
                    image_url_list = execjs.eval(content.text)
                except Exception:
                    print(">>>>>>>>>> fail to parse image: " + str(index))
                    continue

                assert len(image_url_list) > 0

                image_keys = list()
                for image_url in image_url_list:
                    match = re.search(r"/(\d+_\d{4})\.(\w+)\?", image_url)
                    if match is not None:
                        image_key = match.group(1)
                        suffix = match.group(2)
                        image_keys.append(image_key)

                        file_path = folder_name + "/" + image_key + "." + suffix
                        cls.task_pool.put([file_path, image_url, 0])

                assert len(image_keys) > 0, query_url

                # sort & find largest image number
                image_keys.sort(key=lambda x: int(x.split("_")[0]))
                index = max(int(image_keys[-1].split("_")[0]), index)
                print("now index[%d], total[%d]" % (index, image_number))

        return True