Example #1
    def parse(self, soup_obj):
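        # Parse each row of the torrent table into a SeedInfo entry,
        # skipping the header row and rows without enough cells.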
        assert soup_obj is not None

        tr_list = soup_obj.select("table.torrents tr")

        seeds = []
        cnt = 0
        for tr in tr_list:
            cnt += 1
            if cnt == 1:
                # skip the caption tr
                continue

            seed = SeedInfo()
            td_list = tr.select("td.rowfollow")
            if len(td_list) < 9:
                # skip embedded contents
                continue

            seed.sticky = len(
                td_list[1].select("table td img[alt=\"Sticky\"]")) > 0
            seed.title = td_list[1].select("table td a")[0]["title"]
            seed.url = td_list[1].select("table td a")[0]['href']
            seed.free = len(td_list[1].select("table font.free")) > 0
            seed.hot = len(td_list[1].select("table font.hot")) > 0
            seed.since = HttpUtils.get_content(td_list[3], "span")
            seed.size = float(self.parse_size(td_list[4]))
            seed.upload_num = int(self.clean_tag(td_list[5]))
            seed.download_num = int(self.clean_tag(td_list[6]))
            seed.finish_num = int(self.clean_tag(td_list[7]))
            seed.id = self.parse_id(seed.url)

            seeds.append(seed)

        return seeds
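The SeedInfo value object is not part of this listing; the sketch below is an assumption covering only the fields these parsers actually assign, not the real class from the source repository.

class SeedInfo:
    # Hypothetical container for one torrent row (an assumption; the real
    # class may carry more state and behavior).
    def __init__(self):
        self.id = None
        self.title = None
        self.url = None
        self.since = None          # upload time text
        self.size = 0.0            # size as parsed by parse_size()
        self.upload_num = 0        # seeders
        self.download_num = 0      # leechers
        self.finish_num = 0        # times snatched
        self.sticky = False
        self.free = False
        self.hot = False
        self.discount = 100        # percent counted against the ratio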
Example #2
    def parse_lvl_one(cls):
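        # Fetch the comic's chapter list with retries, then feed every
        # chapter link to parse_lvl_two and wait for the worker threads.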
        if cls.book_id is None:
            return

        url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id
        retry = 0
        while True:
            resp = HttpUtils.get(url)
            if resp is not None:
                break
            else:
                retry += 1

            assert retry < 5, "fail to query %s" % url

        cls.comic_name = HttpUtils.get_content(resp, "title").strip()
        links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href")

        titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a")

        assert len(titles) == len(links)

        cls.init_thread()

        for index in range(len(titles)):
            link = links[index]
            title = titles[index].strip()
            cls.parse_lvl_two((link, title))
        cls.process_thread.join()

        # fallback: drain any tasks left in the pool (empty if all went well)
        while not cls.task_pool.empty():
            print("pool size = " + str(cls.task_pool.qsize()))
            cls.init_thread()
            cls.process_thread.join()
Example #3
    def get_score(self):
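        # Check in first, then read the current score from the profile page.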
        self.check_in()

        soup = HttpUtils.get("http://www.miui.com/space-uid-2248502469.html")
        assert soup is not None
        score = HttpUtils.get_content(
            soup, "#statistic_content li:nth-of-type(1) a")
        return int(score)
Example #4
    def read_msg(self, index):
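        # Fetch one inbox page and return the parsed Message list.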
        self.login_if_not()

        soup_obj = HttpUtils.get(self.url + index,
                                 headers=self.site.login_headers)
        assert soup_obj is not None

        tr_list = soup_obj.select("#outer form table tr")

        messages = []
        cnt = 0
        for tr in tr_list:
            cnt += 1
            if cnt == 1:
                # skip the caption tr
                continue

            td_list = tr.select("td.rowfollow")

            if len(td_list) < 4:
                # skip footer
                continue

            msg = Message()
            msg.read = len(td_list[0].select("img[alt=\"Read\"]")) > 0
            msg.title = HttpUtils.get_content(td_list[1], "a")
            msg.from_user = HttpUtils.get_content(td_list[2], "span a b")
            if msg.from_user is None:
                # for ad.
                msg.from_user = td_list[2].contents[0]
            msg.since = HttpUtils.get_content(td_list[3], "span")
            link = HttpUtils.get_attr(td_list[1], "a", "href")
            msg.id = link.split("id=")[1]
            messages.append(msg)

        print("--------------------------------------")
        index = 1
        for msg in messages:
            print("{:<2}|".format(index) + str(msg))
            index += 1
        print("--------------------------------------")

        return messages
Example #5
    def parse_lvl_one(cls):
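        # Build the chapter list, filter by chapter_mode and inclusion_list,
        # then dispatch matching chapters to parse_lvl_two.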
        if cls.book_id is None:
            print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
            return

        resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id),
                                        headers=cls.headers)
        assert resp is not None

        cls.comic_name = HttpUtils.get_content(resp,
                                               ".detail-info-title").strip()
        cls.root_folder = os.path.join("output", cls.comic_name)
        links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")

        titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
        image_numbers = HttpUtils.get_contents(
            resp, "div.detail-list-form-con a span")
        image_numbers = list(
            map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

        assert len(titles) == len(image_numbers)
        assert len(titles) == len(links)

        cnt = 0
        for index in range(len(titles)):
            cls.init_thread()

            link = links[index].replace("/", "").replace("m", "")
            title = titles[index].strip()
            image_number = image_numbers[index]
            # mode 1 keeps chapters ("第"/"话"/"話"); mode 2 keeps volumes ("卷"/"第")
            is_chapter = "第" in title or "话" in title or "話" in title
            is_volume = "卷" in title or "第" in title
            if (cls.chapter_mode == 1 and not is_chapter) or (
                    cls.chapter_mode == 2 and not is_volume):
                print("Skip " + title)
                continue

            is_skip = False
            if cls.inclusion_list is not None:
                for inclusion in cls.inclusion_list:
                    if inclusion not in title:
                        is_skip = True
                        break

            if not is_skip and cls.parse_lvl_two((link, title, image_number)):
                cnt += 1

        if cnt > 0:
            cls.process_thread.join()

        # fallback: drain any tasks left in the pool (empty if all went well)
        while not cls.task_pool.empty():
            print("pool size = " + str(cls.task_pool.qsize()))
            cls.init_thread()
            cls.process_thread.join()
Example #6
    def action(self, data):
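        # Cast a vote for every torrent id in data and count the successes.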
        vote_url = "https://kp.m-team.cc/vote.php?tid=%s&type=1"
        success_cnt = 0
        for id in data:
            res_obj = HttpUtils.get(url=vote_url % id,
                                    headers=self.site.login_headers)
            msg = HttpUtils.get_content(res_obj, "#outer table h2")
            if msg == "操作成功":
                success_cnt += 1

        print("Vote success: " + str(success_cnt))
Example #7
    def parse_page(self, soup_obj):
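        # Parse the torrent listing, including the half/30% discount markers.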
        tr_list = soup_obj.select("#torrent_table tr")

        seeds = []
        cnt = 0
        for tr in tr_list:
            cnt += 1
            if cnt == 1:
                # skip the caption tr
                continue

            seed = SeedInfo()
            td_list = tr.select("td")
            if len(td_list) < 10:
                continue

            seed.sticky = len(td_list[1].select("div img[alt=\"置顶\"]")) > 0  # 置顶 = sticky
            seed.title = HttpUtils.get_content(td_list[1], "div a b")
            seed.url = td_list[1].select("div a")[0]['href']
            seed.free = len(td_list[1].select("div a img[alt=\"free\"]")) > 0
            seed.since = HttpUtils.get_content(td_list[3], "span")
            seed.size = float(self.parse_size(td_list[4]))
            seed.upload_num = int(self.clean_tag(td_list[5]))
            seed.download_num = int(self.clean_tag(td_list[6]))
            seed.finish_num = int(self.clean_tag(td_list[7]))
            seed.id = self.parse_id(seed.url)

            # parse discount
            if len(td_list[1].select("table td font.halfdown")) > 0:
                seed.discount = 50
            elif len(td_list[1].select("table td font.d30down")) > 0:
                seed.discount = 30
            else:
                seed.discount = 100

            seeds.append(seed)

        return seeds
Example #8
    def parse(self, soup_obj):
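        # Parse the torrent table; mail an alert when suspiciously few rows parse.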
        assert soup_obj is not None

        tr_list = soup_obj.select("table.torrents tr")

        seeds = []
        cnt = 0
        for tr in tr_list:
            cnt += 1
            if cnt == 1:
                # skip the caption tr
                continue

            seed = SeedInfo()
            td_list = tr.select("td.rowfollow")
            if len(td_list) < 9:
                # skip embedded contents
                continue

            seed.since = HttpUtils.get_content(td_list[2], "span")
            seed.size = float(self.parse_size(td_list[3]))
            seed.upload_num = int(self.clean_tag(td_list[4]))
            seed.download_num = int(self.clean_tag(td_list[5]))
            seed.finish_num = int(self.clean_tag(td_list[6]))
            seed.done = self.clean_tag(td_list[7])
            seed.working = "peer-active" in td_list[7]['class']

            td_title = tr.select("td.torrenttr tr td")
            seed.sticky = len(td_title[0].select("img[alt=\"Sticky\"]")) > 0
            seed.title = td_title[0].select("a")[0]["title"]
            seed.url = td_title[0].select("a")[0]['href']
            seed.free = len(td_title[0].select("img[alt=\"Free\"]")) > 0
            seed.hot = len(td_title[0].select("font.hot")) > 0
            if len(td_title[0].select("img[alt=\"50%\"]")) > 0:
                seed.discount = 50
            elif len(td_title[0].select("img[alt=\"30%\"]")) > 0:
                seed.discount = 30
            elif seed.free:
                seed.discount = 0
            else:
                seed.discount = 100
            seed.id = self.parse_id(seed.url)

            seeds.append(seed)

        print("Crawl: " + str(len(seeds)))
        if len(seeds) < 10:
            # subject 无法解析页面 = "failed to parse the page"
            EmailSender.send(u"无法解析页面", Config.get("mteam_username"))

        return seeds
Example #9
    def check_login(self, site):
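        # Verify the saved session by checking the username on the home page;
        # keep the cookie if it matches, clear it otherwise.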
        HttpUtils.create_session_if_absent()
        HttpUtils.load_cookie()

        soup_obj = HttpUtils.get(site.home_page, headers=site.login_headers)
        content = HttpUtils.get_content(soup_obj, site.login_verify_css_selector)
        print("Current user is " + str(content))
        result = content is not None and content == site.login_verify_str

        if result:
            HttpUtils.save_cookie()
        else:
            HttpUtils.clear_cookie()

        return result
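All of these examples go through a shared HttpUtils helper that is not shown in this listing. The sketch below is a minimal assumption of the calls used here, built on requests and BeautifulSoup; the real helper in the source repository may differ.

import requests
from bs4 import BeautifulSoup

class HttpUtils:
    session = requests.Session()

    @classmethod
    def get(cls, url, headers=None):
        # Fetch a page and return it parsed, or None on any network error.
        try:
            resp = cls.session.get(url, headers=headers, timeout=10)
            return BeautifulSoup(resp.text, "html.parser")
        except requests.RequestException:
            return None

    @staticmethod
    def get_content(soup, selector):
        # Text of the first node matching the CSS selector, or None.
        node = soup.select_one(selector)
        return node.get_text().strip() if node else None

    @staticmethod
    def get_attr(soup, selector, attr):
        # Attribute value of the first matching node, or None.
        node = soup.select_one(selector)
        return node.get(attr) if node else None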
Example #10
    def parse_page(self, soup_obj):
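        # Parse an RSS feed where the size is embedded in the title as "[...]".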
        items = soup_obj.select("item")
        assert len(items) != 0

        seeds = []
        for item in items:
            try:
                info = HttpUtils.get_content(item, "title").split("[")

                seed = SeedInfo()

                seed.title = info[0].strip()
                seed.size = HttpUtils.pretty_format(info[1].split("]")[0], "MB")
                seed.url = HttpUtils.get_attr(item, "enclosure", "url")
                seed.id = self.parse_id(seed.url)
                # Cache().set(seed.id, str(seed))

                seeds.append(seed)
            except Exception as e:
                print(e)  # Exception has no getMessage() in Python

        return seeds
Example #11
    def parse_page(self, soup_obj):
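        # Parse an RSS feed variant whose link sits in the raw item contents.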
        items = soup_obj.select("item")
        assert len(items) != 0

        seeds = []
        for item in items:
            try:
                info = HttpUtils.get_content(item, "title").split("[")

                seed = SeedInfo()

                seed.title = info[0].strip()
                seed.size = HttpUtils.pretty_format(
                    info[1].split(" ")[-2] + info[1].split(" ")[-1], "MB")
                # seed.url = HttpUtils.get_content(item, "link")
                seed.url = item.contents[4]
                seed.id = self.parse_id(seed.url)

                seeds.append(seed)
            except Exception:
                pass

        return seeds
Example #12
    def crawl_book(cls):
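        # Crawl Douban book listings per tag and dump the books whose
        # rating count exceeds people_threshold.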

        tag_source_url = "https://book.douban.com/tag/"
        soup_obj = HttpUtils.get(tag_source_url)

        tags = HttpUtils.get_contents(soup_obj, "div.article tr td a")

        # override the scraped tags with a curated list
        tags = [
            '小说', '外国文学', '文学', '中国文学', '经典', '日本文学', '古典文学', '王小波', '当代文学',
            '钱钟书', '外国名著', '推理', '绘本', '青春', '东野圭吾', '科幻', '言情', '悬疑', '奇幻',
            '韩寒', '推理小说', '阿加莎·克里斯蒂', '科幻小说', '魔幻', '历史', '心理学', '哲学', '传记',
            '文化', '社会学', '艺术', '设计', '社会', '政治', '建筑', '宗教', '电影', '政治学', '数学',
            '中国历史', '回忆录', '思想', '国学', '人物传记', '人文', '音乐', '艺术史', '绘画', '戏剧',
            '西方哲学', '二战', '军事', '佛教', '近代史', '考古', '自由主义', '美术', '爱情', '旅行',
            '成长', '生活', '心理', '励志', '摄影', '教育', '游记', '灵修', '健康', '情感', '两性',
            '人际关系', '手工', '养生', '家居', '自助游', '经济学', '管理', '经济', '商业', '金融',
            '投资', '营销', '理财', '创业', '广告', '股票', '企业史', '策划', '科普', '互联网', '编程',
            '科学', '交互设计', '用户体验', '算法', '科技', 'web', 'UE', '交互', '通信', 'UCD',
            '神经网络', '程序'
        ]
        print(tags)

        book_shelf = dict()
        for tag in tags:
            for page in range(0, 10):
                url = "https://book.douban.com/tag/%s?start=%d&type=T" % (
                    tag, page * 20)
                soup_obj = HttpUtils.get(url)

                if soup_obj is None:
                    print("blocked?")
                    break

                print(tag, page)
                books_obj = soup_obj.select("#subject_list ul > li")

                if len(books_obj) == 0:
                    break

                for book_obj in books_obj:
                    try:
                        title = HttpUtils.get_attr(book_obj, "h2 a", "title")
                        rating = float(
                            HttpUtils.get_content(book_obj, "span.rating_nums"))
                        # "span.pl" reads like "(1234人评价)"; strip the
                        # parentheses and the "人评价" ("people rated") suffix
                        people = int(
                            HttpUtils.get_content(book_obj, "span.pl")
                            .strip()
                            .replace("人评价", "")
                            .replace("(", "")
                            .replace(")", ""))

                        if people > cls.people_threshold:
                            if title in book_shelf:
                                book_shelf[title].tag.append(tag)
                            else:
                                book_shelf[title] = Book(
                                    title, rating, people, [tag])
                    except Exception:
                        pass

                # brief pause to stay under the per-IP rate limit
                sleep(random() * 0.5 + 0.5)

        books = list(book_shelf.values())

        with open("douban_book_raw.txt", "w") as fp:
            fp.write(json.dumps(books, default=Book.convert))
Example #13
    def crawl_single(self, user_id):
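        # Crawl one user's profile page and extract name, status, ratio,
        # traffic and rank; buffer the result or record the failure.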

        if self.skip_if_exist and self.cache.hash_get(self.id_bucket_name,
                                                      user_id) is not None:
            print("Skip " + str(user_id))
            return

        try:
            url = self.site.home_page % str(user_id)
            soup_obj = HttpUtils.get(url,
                                     headers=self.site.login_headers,
                                     return_raw=False)
            assert soup_obj is not None

            user = User()
            user.id = user_id
            user.name = HttpUtils.get_content(soup_obj, "#outer h1 span b")

            if user.name is None:
                return

            user.is_warn = len(
                soup_obj.select("#outer h1 span img[alt='Leechwarned']")) > 0
            user.is_ban = len(
                soup_obj.select("#outer h1 span img[alt='Disabled']")) > 0
            if user.is_warn:
                user.warn_time = str(time.strftime("%Y-%m-%d %H:%M:%S"))

            try:
                if len(soup_obj.select("#outer table tr")) <= 5:
                    user.is_secret = True
                    # print("secret user: name={0} id={1}".format(user.name, str(user_id)))
                else:
                    tr_list = soup_obj.select("#outer table tr")
                    for tr in tr_list:
                        td_name = HttpUtils.get_content(
                            tr, "td:nth-of-type(1)")
                        if td_name == "加入日期":
                            user.create_time = HttpUtils.get_content(
                                tr, "td:nth-of-type(2)").replace(" (", "")
                        elif td_name == "最近動向":
                            user.last_time = HttpUtils.get_content(
                                tr, "td:nth-of-type(2)").replace(" (", "")
                        elif td_name == "傳送":
                            user.ratio = HttpUtils.get_content(
                                tr, "td:nth-of-type(2) table tr td font")
                            if user.ratio is None:
                                # seems that no download is made and ratio is infinite
                                user.ratio = -1
                                user.up = self.parse_size_in_gb(
                                    HttpUtils.get_content(
                                        tr,
                                        "td:nth-of-type(2) table tr:nth-of-type(1) td:nth-of-type(1)",
                                        1))
                                user.down = self.parse_size_in_gb(
                                    HttpUtils.get_content(
                                        tr,
                                        "td:nth-of-type(2) table tr:nth-of-type(1) td:nth-of-type(2)",
                                        2))
                            else:
                                user.ratio = user.ratio.replace(",", "")
                                user.up = self.parse_size_in_gb(
                                    HttpUtils.get_content(
                                        tr,
                                        "td:nth-of-type(2) table tr:nth-of-type(2) td:nth-of-type(1)",
                                        1))
                                user.down = self.parse_size_in_gb(
                                    HttpUtils.get_content(
                                        tr,
                                        "td:nth-of-type(2) table tr:nth-of-type(2) td:nth-of-type(2)",
                                        2))
                        elif td_name == "魔力值":
                            user.mp = HttpUtils.get_content(
                                tr, "td:nth-of-type(2)")

                    # parse rank
                    user.rank = "secret"
                    imgs = soup_obj.select(
                        "table.main table tr > td > img[title!='']")
                    for img in imgs:
                        if not img.has_attr("class"):
                            user.rank = img["title"]

                            if "Peasant" in user.rank:
                                user.warn_time = str(
                                    time.strftime("%Y-%m-%d %H:%M:%S"))
                                # print("###### find user="******" id=" + str(user_id) + " rank=" + user.rank)
            except Exception as e:
                print(str(user_id) + "\n" + str(e) + "\n")

            self.buffer.append(user)
        except Exception as e:
            print(">>>>> fail to parse " + str(user_id))
            self.errors.append(user_id)
Example #14
    def water_copy(self):
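        # Collect well-rated replies from popular threads, then re-post
        # them as replies to farm forum score.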
        self.check_in()

        forum_id_list = ["772", "773"]
        forum_id = forum_id_list[int(random() * len(forum_id_list)) - 1]
        article_url_template = "http://www.miui.com/forum.php?mod=forumdisplay&fid={0}&orderby=replies&filter=reply&orderby=replies&page={1}"
        page_num = 1
        max_cnt = 50

        reply_list = dict()
        stop_flag = False
        while not stop_flag:
            soup_obj = HttpUtils.get(
                article_url_template.format(forum_id, page_num))
            print("current page: " + str(page_num))
            page_num += 1

            article_list = soup_obj.select("tbody")

            for article in article_list:
                id = article.attrs["id"]
                if not id.startswith("normalthread"):
                    continue

                id = id[13:]

                if Cache().get(id) is not None:
                    print("Skip " + id)
                    # has been replied within a few days, skip
                    continue

                title = HttpUtils.get_content(article,
                                              ".sub-tit > a:nth-of-type(1)")
                # fetch the author so their own replies can be skipped later
                author = HttpUtils.get_content(article,
                                               ".sub-infos a:nth-of-type(1)")
                reply_num = HttpUtils.get_content(
                    article, "span.number_d a:nth-of-type(1)")

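                # sample replies from the middle third of the thread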
                total_thread_page_num = int(int(reply_num) / 10)
                start_thread_page_num = int(total_thread_page_num / 3)
                end_thread_page_num = start_thread_page_num * 2
                current_thread_page_num = start_thread_page_num + int(
                    random() * 3)

                content_candidates = list()

                while (len(content_candidates) == 0
                       and current_thread_page_num <= end_thread_page_num):
                    page_url = self.page_url_template_copy.format(
                        id, current_thread_page_num)
                    current_thread_page_num += 1
                    page_soup_obj = HttpUtils.get(
                        page_url, headers=self.site.login_headers)
                    assert page_soup_obj is not None

                    # check if allow to reply
                    edit_content = HttpUtils.get_content(
                        page_soup_obj, "#fastposteditor .pt")
                    # 您现在无权发帖 = "you are not allowed to post now"
                    if edit_content is not None and "您现在无权发帖" in str(edit_content):
                        Cache().set(id, "")
                        print(id + " not allowed to reply")
                        break

                    # skip poll threads (they earn less score)
                    form = page_soup_obj.select("#poll", limit=1)
                    if form is not None and len(form) > 0:
                        Cache().set(id, "")
                        print(id + " skip vote")
                        break

                    post_list = page_soup_obj.select("#postlist > div")
                    for post in post_list:
                        try:
                            current_author = HttpUtils.get_content(
                                post, ".authi a")
                            if current_author == author:
                                continue

                            score = int(
                                HttpUtils.get_content(post, ".pil dd a"))
                            if score < 1500:
                                continue

                            content = HttpUtils.get_content(
                                post, ".pct table tr td.t_f")
                            if (content is None or content.strip() == ""
                                    or len(content) < 10 or len(content) > 50):
                                continue

                            if author in content:
                                continue

                            contain_black_list = False
                            for black_word in self.comments_black_list:
                                if black_word in content:
                                    contain_black_list = True
                                    break

                            if contain_black_list:
                                continue

                            content_candidates.append(content.strip())
                        except Exception:
                            pass

                print(title)
                print(content_candidates)
                if len(content_candidates) > 0:
                    # randomly pick one
                    reply_list[id] = content_candidates[
                        int(random() * len(content_candidates)) - 1]
                    print(id + " -- " + reply_list[id])

                print("current reply=" + str(len(reply_list)))
                if len(reply_list) >= max_cnt:
                    stop_flag = True
                    break

        # start reply
        for thread_id in reply_list:
            try:
                message = reply_list[thread_id]
                post_data = dict()
                post_data["posttime"] = str(int(time.time()))
                post_data["formhash"] = self.form_hash_mirror
                post_data["usesig"] = "1"
                post_data["subject"] = "  "
                post_data["message"] = message

                form_submit_url = "http://www.miui.com/forum.php?mod=post&action=reply&fid={0}&tid={1}&extra=page=1&replysubmit=yes&infloat=yes&handlekey=fastpost".format(
                    forum_id, thread_id)
                print(thread_id, message, self.get_score())

                post_result = HttpUtils.post(form_submit_url,
                                             headers=self.site.login_headers,
                                             data=post_data,
                                             returnRaw=False)
                assert post_result is not None
                Cache().set_with_expire(thread_id, message, 86400 * 4)
                time.sleep(int(random() * 60) + 90)
            except Exception:
                pass
Example #15
    def zz_copy(self):
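        # Pick old, low-reply question threads and re-post them as new threads.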
        source_url_template = "http://www.miui.com/forum.php?mod=forumdisplay&fid=773&orderby=dateline&filter=author&orderby=dateline&page={0}"
        thread_url_template = "http://www.miui.com/thread-{0}-1-1.html"
        post_url = "http://www.miui.com/forum.php?mod=post&action=newthread&fid=773&extra=&topicsubmit=yes"
        min_page_num = 300

        self.check_in()

        title_white_list = ["问题", "探索版", "怎么", "什么"]
        title_black_list = ["内测", "发货", "积分", "在线"]

        page_num = min_page_num + int(random() * 700)
        max_cnt = 20
        article_candidates = dict()
        stop_flag = False
        while not stop_flag:
            try:
                soup_obj = HttpUtils.get(source_url_template.format(page_num))
                assert soup_obj is not None
                print("current page: " + str(page_num))
                page_num -= 1

                article_list = soup_obj.select("tbody")

                for article in article_list:
                    id = article.attrs["id"]
                    if not id.startswith("normalthread"):
                        continue

                    id = id[13:]

                    if Cache().get("ZZ_" + id) is not None:
                        print("Skip " + id)
                        # has been ZZed within a few days, skip
                        continue

                    title = HttpUtils.get_content(
                        article, ".sub-tit > a:nth-of-type(1)")
                    reply_num = int(
                        HttpUtils.get_content(
                            article, "span.number_d a:nth-of-type(1)"))

                    if reply_num > 8:
                        continue

                    is_white_list = False
                    for white_list in title_white_list:
                        if white_list in title:
                            is_white_list = True

                    if not is_white_list:
                        continue

                    is_black_list = False
                    for black_list in title_black_list:
                        if black_list in title:
                            is_black_list = True

                    if is_black_list:
                        continue

                    thread_soup_obj = HttpUtils.get(
                        thread_url_template.format(id))
                    assert thread_soup_obj is not None
                    content = HttpUtils.get_content(thread_soup_obj,
                                                    "#postlist > div .t_f")

                    if content is None or content.strip() == "":
                        continue

                    article_candidates[id] = (title, content.strip())

                    if len(article_candidates) >= max_cnt:
                        stop_flag = True
                        break
            except Exception:
                pass

        for id in article_candidates:
            try:
                (title, message) = article_candidates[id]

                post_data = dict()
                post_data["posttime"] = str(int(time.time()))
                post_data["formhash"] = self.form_hash_mirror
                post_data["wysiwyg"] = "1"
                post_data["typeid"] = "7562"
                post_data["allownoticeauthor"] = "1"
                post_data["addfeed"] = "1"
                post_data["usesig"] = "1"
                post_data["save"] = ""
                post_data["uploadalbum"] = "-2"
                post_data["newalbum"] = "请输入相册名称"
                post_data["subject"] = title
                post_data["message"] = message

                print((title, message))

                post_result = HttpUtils.post(post_url,
                                             headers=self.site.login_headers,
                                             data=post_data,
                                             returnRaw=False)
                assert post_result is not None

                Cache().put("ZZ_" + id)

                time.sleep(int(random() * 300) + 1800)
            except Exception:
                pass
Example #16
    def vote(self):
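        # Vote option 1 in the listed user's poll threads until the score
        # stops increasing or max_cnt votes are cast.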
        self.check_in()

        source_list_url_template = "http://www.miui.com/home.php?mod=space&uid=133153462&do=thread&view=me&order=dateline&from=space&page={0}"
        page_num = 1
        max_cnt = 10
        cnt = 0
        stop_flag = False
        while not stop_flag:
            soup = HttpUtils.get(source_list_url_template.format(page_num),
                                 headers=self.site.login_headers)
            assert soup is not None

            page_num += 1

            current_score = self.get_score()
            previous_score = current_score

            article_urls = HttpUtils.get_attrs(soup, "div.tl th > a", "href")
            for article_url in article_urls:
                try:
                    article_url = "http://www.miui.com/" + article_url
                    article_soup = HttpUtils.get(
                        article_url, headers=self.site.login_headers)
                    assert article_soup is not None
                    title = HttpUtils.get_content(article_soup, "title")
                    form = article_soup.select("#poll", limit=1)
                    option = article_soup.select("#option_1", limit=1)
                    if form is None or len(form) == 0:
                        continue
                    if option is None or len(option) == 0:
                        continue
                    print(title)

                    # do vote here
                    post_url = "http://www.miui.com/" + HttpUtils.get_attr(
                        article_soup, "#poll", "action") + "&inajax=1"

                    post_data = dict()
                    post_data["pollanswers[]"] = HttpUtils.get_attr(
                        article_soup, "#option_1", "value")
                    post_data["formhash"] = self.form_hash_mirror
                    post_result = HttpUtils.post(
                        post_url,
                        headers=self.site.login_headers,
                        data=post_data,
                        returnRaw=False)
                    assert post_result is not None

                    current_score = self.get_score()
                    print(previous_score)
                    print(current_score)

                    cnt += 1
                    if cnt >= max_cnt or previous_score == current_score:
                        stop_flag = True
                        break

                    previous_score = current_score
                    time.sleep(60)
                except Exception:
                    pass
Example #17
    def do_process(cls, link):
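        # Check a comic's chapter/volume numbering for gaps and queue
        # complete sets to the output pool.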
        resp = HttpUtils.get_with_retry(cls.root_url + link,
                                        headers=cls.headers)
        assert resp is not None

        cls.comic_name = HttpUtils.get_content(resp,
                                               ".detail-info-title").strip()
        comic_author = HttpUtils.get_content(
            resp, ".detail-info-tip span a").strip()
        comic_status = HttpUtils.get_content(
            resp, ".detail-info-tip span:nth-of-type(2) span").strip()
        titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")

        # validation
        titles = list(map(lambda x: x.strip(), titles))
        if len(titles) == 0:
            return

        chap_ids = list()
        vol_ids = list()
        for title in titles:
            id = re.search(".+?(\d*).+?", title).group(1)
            if id == "":
                # print("Cannot parse: " + title)
                pass
            else:
                if "話" in title:
                    chap_ids.append(int(id))
                elif "卷" in title:
                    vol_ids.append(int(id))

        max_chap = -1
        max_vol = -1
        is_missed = False
        if len(chap_ids) > 0:
            missing_ids = list()
            chap_ids.sort()
            max_chap = chap_ids[-1]

            for i in range(1, max_chap + 1):
                if i not in chap_ids:
                    missing_ids.append(i)
            if len(missing_ids) > 0:
                # print("Missing chapters: " + str(missing_ids))
                is_missed = True

        if len(vol_ids) > 0:
            missing_ids = list()
            vol_ids.sort()
            max_vol = vol_ids[-1]

            for i in range(1, max_vol + 1):
                if i not in vol_ids:
                    missing_ids.append(i)
            if len(missing_ids) > 0:
                # print("Missing volumes: " + str(missing_ids))
                is_missed = True

        if not is_missed:
            # print(">>>>>>>>>>>> WOW! FULL SET: %s <<<<<<<<<<<<" % cls.comic_name)
            cls.output_pool.put((cls.comic_name, comic_author, comic_status,
                                 max_chap, max_vol, link))
Example #18
    def water(self):
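        # Post counting replies built from Chinese numerals to fresh threads
        # for forum points, pacing posts with random sleeps.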
        self.check_in()

        url_prefix = "http://www.miui.com/forum.php?mod=forumdisplay&fid=5&orderby=dateline&filter=author&orderby=dateline&page="
        page = 1
        cnt = 1
        max_cnt = 50
        chinese_char = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

        id_list = []
        while True:
            soup_obj = HttpUtils.get(url_prefix + str(page))
            print("new page: " + str(page))
            id_list.extend(HttpUtils.get_attrs(soup_obj, "tbody", "id"))

            page += 1

            if len(id_list) > max_cnt:
                break

        id_list = id_list[:max_cnt]
        for id in id_list:
            if not id.startswith("normalthread"):
                continue

            id = id[13:]
            page_url = self.page_url_template.format(id)

            page_soup_obj = HttpUtils.get(page_url)
            assert page_soup_obj is not None

            i = str(cnt)
            length = len(i)
            num = ""
            for index in range(length):
                num += chinese_char[int(i[index])]

            id_num = ""
            for index in range(len(id)):
                id_num += chinese_char[int(id[index])]

            random_id = str(int(random() * 1000000000000000))

            random_id_num = ""
            for index in range(len(random_id)):
                random_id_num += chinese_char[int(random_id[index])]

            title = HttpUtils.get_content(page_soup_obj,
                                          "title").strip().replace(
                                              "_灌者为王_MIUI论坛", "")

            message = "时间{0},帖子ID{1},标题\"{2}\",随机数{3},第{4}个积分,打扰".format(
                time.strftime("%b %d %Y %H:%M:%S", time.localtime()), id_num,
                title, random_id_num, num)
            # form_hash = page_soup_obj.select("input[name='formhash']")[0]["value"]
            post_data = dict()
            post_data["posttime"] = str(int(time.time()))
            post_data["formhash"] = self.form_hash_mirror
            post_data["usesig"] = "1"
            post_data["subject"] = "  "
            post_data["message"] = message

            form_submit_url = "http://www.miui.com/forum.php?mod=post&action=reply&fid=5&tid={0}&extra=page=1&replysubmit=yes&infloat=yes&handlekey=fastpost".format(
                id)

            # print(post_data)

            post_result = HttpUtils.post(form_submit_url,
                                         headers=self.site.login_headers,
                                         data=post_data,
                                         returnRaw=False)
            assert post_result is not None
            time.sleep(int(random() * 60) + 90)
            cnt += 1