Beispiel #1
0
    def get_topics_list(self, response):
        """
        Schedule thread-list requests for every main topic without a ``fid``.

        The ``topics`` of each category under ``data.list`` in the JSON body
        are flattened into one list. Topics whose ``fid`` is truthy are
        skipped; for each remaining topic one request per page (pages 1-20)
        is yielded.

        :param response: response whose body is JSON containing ``data.list``
        :return: generator of ``scrapy.Request`` objects handled by
            ``get_sub_topics``, each carrying ``topic_id`` in ``meta``
        """
        categories = json.loads(response.body.decode())["data"]["list"]
        # Flatten up front: a category without "topics" fails here, before
        # any request has been produced.
        all_topics = [entry for cate in categories for entry in cate["topics"]]
        threads_endpoint = "https://bbs.mobileapi.hupu.com/1/7.3.17/topics/threads"

        for entry in all_topics:
            if entry.get("fid"):
                continue

            topic_id = entry.get("topic_id")
            for page_no in range(1, 21):
                client = utils.get_random_client()
                query = {
                    'clientId': utils.get_random_clientId(),
                    'crt': int(time.time() * 1000),
                    'night': '0',
                    'stamp': 0,
                    '_ssid': utils.get_random_ssid(),
                    '_imei': utils.get_random_imei(),
                    'time_zone': 'Asia/Shanghai',
                    'tab_type': '2',
                    'client': client,
                    'topic_id': topic_id,
                    'page': page_no,
                    'android_id': client,
                }
                # The signature is computed over all other parameters.
                query["sign"] = utils.get_sign(query)
                request_url = threads_endpoint + "?" + urlencode(query)
                yield scrapy.Request(
                    url=request_url,
                    dont_filter=True,
                    callback=self.get_sub_topics,
                    meta={"topic_id": topic_id},
                )
Beispiel #2
0
    def start_requests(self):
        """
        Yield the initial request for the index of all topics.

        Builds the query string (including its ``sign`` value) for the topics
        endpoint and routes the response to ``get_topics_list``.

        :return: generator yielding a single ``scrapy.Request``
        """
        index_url = "https://bbs.mobileapi.hupu.com/1/7.3.17/topics"
        # No trailing comma here: `client = ...,` would build a one-element
        # tuple, which would then be signed and URL-encoded as the tuple's
        # repr instead of the client id itself.
        client = utils.get_random_client()
        params = {
            'all': '1',
            'clientId': utils.get_random_clientId(),
            'crt': int(time.time() * 1000),
            'night': '0',
            'client': client,
            '_ssid': utils.get_random_ssid(),
            '_imei': utils.get_random_imei(),
            'time_zone': 'Asia/Shanghai',
            'android_id': client,
        }
        # The signature is computed over all other parameters.
        params["sign"] = utils.get_sign(params)

        yield scrapy.Request(
            url=index_url + "?" + urlencode(params),
            dont_filter=True,
            callback=self.get_topics_list,
        )
Beispiel #3
0
 def get_fid(self, topic_id, timeout=10):
     """
     Fetch the ``fid`` of a topic with a synchronous HTTP request.

     :param topic_id: id interpolated into the topic-info URL
     :param timeout: seconds passed to ``requests.get``; without a timeout
         a stalled connection would block the caller indefinitely
     :return: the value at ``data.topic.fid`` in the JSON response
     :raises KeyError: if the response JSON lacks ``data.topic.fid``
     """
     topic_info_url = "https://bbs.mobileapi.hupu.com/1/7.3.17/topics/{}".format(
         topic_id)
     client = utils.get_random_client()
     headers = {
         "user-agent":
         "Mozilla/5.0 (Linux; Android 5.1; Google Build/LMY47D) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 kanqiu/7.3.17.11347/7435 isp/1 network/WIFI",
     }
     params = {
         'clientId': utils.get_random_clientId(),
         'crt': int(time.time() * 1000),
         'night': '0',
         'client': client,
         '_ssid': utils.get_random_ssid(),
         '_imei': utils.get_random_imei(),
         'time_zone': 'Asia/Shanghai',
         'android_id': client,
     }
     # The signature is computed over all other parameters.
     params["sign"] = utils.get_sign(params)
     res = requests.get(topic_info_url,
                        headers=headers,
                        params=params,
                        timeout=timeout)
     fid = json.loads(res.text)["data"]["topic"]["fid"]
     return fid
Beispiel #4
0
    def parse_user_detail(self, response):
        """
        Parse a user's detail payload and crawl that user's contacts.

        Yields the populated ``UserItem`` first, then one user-detail
        ``scrapy.FormRequest`` for every entry returned by ``get_follow``
        and ``get_fans``.

        :param response: JSON response; ``meta`` must carry ``puid``
        :return: generator of one ``UserItem`` followed by requests; nothing
            is yielded when the payload has no ``result`` key
        """
        payload = json.loads(response.body.decode())
        try:
            user_json = payload["result"]
        except KeyError:
            print("source data: %s" % payload)
            return

        mongo_user = UserItem()  # save data to MongoDB
        mongo_user["puid"] = response.meta["puid"]
        mongo_user["nickname"] = user_json["nickname"]

        neo4j_user = User()  # save data to Neo4j
        neo4j_user.puid = str(response.meta["puid"])
        neo4j_user.name = user_json["nickname"]

        mongo_user["header_url"] = user_json["header"]
        mongo_user["level"] = user_json["level"]

        # The first run of digits in reg_time_str is used as a day count;
        # no digits means zero days.
        days_match = re.search(r'\d+', user_json["reg_time_str"])
        register_days = int(days_match.group()) if days_match else 0
        mongo_user["register_date"] = self.get_interval_date(register_days)

        mongo_user["gender"] = user_json["gender"]
        mongo_user["location"] = user_json["location_str"]

        # (item field, source field) pairs whose values are stored as ints.
        counter_fields = (
            ("follow_count", "follow_count"),
            ("fans_count", "be_follow_count"),
            ("be_light_count", "be_light_count"),
            ("be_recommend_count", "be_recommend_count"),
            ("bbs_msg_count", "bbs_msg_count"),
            ("bbs_post_count", "bbs_post_count"),
            ("bbs_recommend_count", "bbs_recommend_count"),
            ("news_comment_count", "news_comment_count"),
        )
        for item_key, json_key in counter_fields:
            mongo_user[item_key] = int(user_json[json_key])

        # URL fields copied verbatim; bbs_follow_url is set before
        # get_follow() receives the item.
        for url_key in ("bbs_msg_url", "bbs_post_url", "bbs_recommend_url",
                        "news_comment_url", "bbs_follow_url"):
            mongo_user[url_key] = user_json[url_key]
        bbs_follow = self.get_follow(mongo_user, neo4j_user)

        # Likewise bbs_fans_url is set before get_fans() receives the item.
        mongo_user["bbs_fans_url"] = user_json["bbs_be_follow_url"]
        bbs_fans = self.get_fans(mongo_user)

        mongo_user["bbs_job"] = user_json["bbs_job"]
        mongo_user["reputation"] = int(user_json["reputation"]["value"])
        mongo_user["update_time"] = self.get_interval_date(0, 1)

        yield mongo_user

        all_contacts = bbs_follow + bbs_fans
        if not all_contacts:
            return

        user_url = "https://games.mobileapi.hupu.com/3/7.3.17/user/page"
        for contact in all_contacts:
            query = {
                "client": utils.get_random_client(),
            }
            formdata = {
                'puid': str(contact["puid"]),
                'time_zone': 'Asia/Shanghai',
                'client': utils.get_random_client(),
                'night': '0',
                'crt': str(int(time.time())),
                'advId': '4497039A-C3C3-4A56-8266-4032F636152D',
                'clientId': utils.get_random_clientId(),
                '_ssid': utils.get_random_ssid(),
            }
            formdata["sign"] = utils.get_sign(formdata)
            yield scrapy.FormRequest(
                url=user_url + "?" + urlencode(query),
                formdata=formdata,
                callback=self.parse_user_detail,
                priority=15,
                meta={"puid": int(contact["puid"])},
            )
Beispiel #5
0
    def get_replies(self, response):
        """
        Parse one page of replies in a post.

        Yields a user-detail ``scrapy.FormRequest`` for every reply on the
        page. The remaining reply pages are scheduled only by the entry
        page, i.e. a response whose ``meta`` does not set ``fan_out`` to
        False. Follow-up requests created here carry ``fan_out=False``;
        otherwise every page would re-request all later pages, fetching
        page k (k - 1) times together with its user requests.

        :param response: JSON response; ``meta`` must carry
            ``current_page``, ``fid`` and ``tid``
        :return: generator of ``scrapy.Request`` / ``scrapy.FormRequest``;
            nothing is yielded when the page has no replies
        """
        res_data = json.loads(response.body.decode())
        result = res_data["data"]["result"]
        replies_list = result["list"]
        if not replies_list:
            return

        current_page = int(response.meta["current_page"])
        all_page = int(result["all_page"])
        fid = response.meta["fid"]
        tid = response.meta["tid"]

        if response.meta.get("fan_out", True):
            posts_url = "https://bbs.mobileapi.hupu.com/1/7.3.17/threads/getsThreadPostList"
            for page in range(current_page + 1, all_page + 1):
                client = utils.get_random_client()
                params = {
                    'fid': fid,
                    'clientId': utils.get_random_clientId(),
                    'crt': int(time.time() * 1000),
                    'night': '0',
                    'maxpid': '',
                    '_ssid': utils.get_random_ssid(),
                    '_imei': utils.get_random_imei(),
                    'sort': '0',
                    'webp': '1',
                    'time_zone': 'Asia/Shanghai',
                    'tid': tid,
                    'offline': 'json',
                    'show_type': '',
                    'postAuthorPuid': '',
                    'client': client,
                    'page': page,
                    'android_id': client,
                    'entrance': '9',
                    'order': 'asc',
                }
                params["sign"] = utils.get_sign(params)
                yield scrapy.Request(
                    url=posts_url + "?" + urlencode(params),
                    callback=self.get_replies,
                    priority=10,
                    meta={
                        "current_page": page,
                        "fid": fid,
                        "tid": tid,
                        # Follow-up pages must not schedule pages again.
                        "fan_out": False,
                    },
                )

        user_url = "https://games.mobileapi.hupu.com/3/7.3.17/user/page"
        for reply in replies_list:
            params = {
                "client": utils.get_random_client(),
            }
            formdata = {
                'puid': str(reply["puid"]),
                'time_zone': 'Asia/Shanghai',
                'client': utils.get_random_client(),
                'night': '0',
                'crt': str(int(time.time())),
                'advId': '4497039A-C3C3-4A56-8266-4032F636152D',
                'clientId': utils.get_random_clientId(),
                '_ssid': utils.get_random_ssid(),
            }
            formdata["sign"] = utils.get_sign(formdata)
            yield scrapy.FormRequest(url=user_url + "?" + urlencode(params),
                                     formdata=formdata,
                                     callback=self.parse_user_detail,
                                     priority=15,
                                     meta={"puid": int(reply["puid"])})
Beispiel #6
0
    def get_sub_topics(self, response):
        """
        Parse the posts listed on one page of a sub-topic.

        For every post that is not flagged as an ad, yields a ``TopicItem``
        followed by a request for the first page of that post's replies.

        :param response: JSON response with ``data.list``; ``meta`` must
            carry ``topic_id``
        :return: generator alternating ``TopicItem`` and ``scrapy.Request``;
            nothing is yielded when the page lists no posts
        """
        posts_list = json.loads(response.body.decode())["data"]["list"]
        if not posts_list:
            return

        # get_fid() performs an HTTP request, so it is only called once the
        # page is known to contain posts.
        topic_id = response.meta["topic_id"]
        fid = self.get_fid(topic_id)
        posts_url = "https://bbs.mobileapi.hupu.com/1/7.3.17/threads/getsThreadPostList"

        for post in posts_list:
            if post.get("is_ad", "") == 1:
                continue
            topic_data = TopicItem()
            topic_data["tid"] = post["tid"]
            launch_str = post["time"]
            # A "天" in the time string is read as "<n> days"; the digits
            # give the day count.
            # NOTE(review): a "天" string without digits falls back to 0
            # days instead of raising AttributeError - confirm which such
            # strings the API can return.
            day_match = re.search(r'\d+', launch_str) if "天" in launch_str else None
            days = int(day_match.group()) if day_match else 0
            topic_data["launch_time"] = self.get_interval_date(days)
            topic_data["title"] = post["title"]
            topic_data["user_name"] = post["user_name"]
            topic_data["update_time"] = self.get_interval_date(0, 1)
            yield topic_data

            client = utils.get_random_client()
            params = {
                'fid': fid,
                'clientId': utils.get_random_clientId(),
                'crt': int(time.time() * 1000),
                'night': '0',
                'maxpid': '',
                '_ssid': utils.get_random_ssid(),
                '_imei': utils.get_random_imei(),
                'sort': '0',
                'webp': '1',
                'time_zone': 'Asia/Shanghai',
                'tid': post["tid"],
                'offline': 'json',
                'show_type': '',
                'postAuthorPuid': '',
                'client': client,
                'page': '1',
                'android_id': client,
                'entrance': '9',
                'order': 'asc',
            }
            params["sign"] = utils.get_sign(params)
            yield scrapy.Request(url=posts_url + "?" + urlencode(params),
                                 callback=self.get_replies,
                                 priority=10,
                                 meta={
                                     "current_page": 1,
                                     "fid": fid,
                                     "tid": post["tid"]
                                 })