# -*- coding: utf-8 -*-
import datetime
import json
import random
import time

import requests
import urllib3
from lxml import html
from tqdm import tqdm

# NOTE: the four imports below are project-local helpers (a Mongo wrapper,
# Mongo connection settings, the Douban title parser, and the seed URL list);
# their module paths are assumed placeholders, adjust them to this repo's layout.
from mongo_db import MongDb
from settings import LocalMongoConfig
from douban_handler import DouBanInfoHandler
from seed_urls import init_urls

# The feed API is called with verify=False below; silence the resulting
# InsecureRequestWarning.
urllib3.disable_warnings()


class YizhoucpCrawl(object):
    __START_URL = "https://api.myrightone.com/api/feed/moment-list"
    __LIKE_PID_URL = "https://api.myrightone.com/api/feed/like"

    __CRACK_SIGN_URL = "http://wx.zxiaoji.com/cp"

    __HOST = "api.myrightone.com"

    def __init__(self, secret_key, token, user_id, check_code, log):
        self.log = log
        self.secret_key = secret_key
        self.user_id = user_id
        self.token = token
        self.check_code = check_code
        self.request = self.__init_request()
        self.cp_mongo = MongDb(LocalMongoConfig.HOST,
                               LocalMongoConfig.PORT,
                               LocalMongoConfig.DB,
                               LocalMongoConfig.USER,
                               LocalMongoConfig.PASSWD,
                               log=self.log)

        self.cp_table = "yizhou_cp"

    def __init_request(self):
        headers = {
            "Host": self.__HOST,
            "App-Id": self.token.split("_")[0],
            "Platform": "ios",
            "Token": self.token,
            "User-Agent":
            "Right-iOS/3.33.2 (com.myrightone.datecha; build:224; iOS 12.1.2) Alamofire/4.8.0",
            "Accept": "*/*",
            "Accept-Encoding": "gzip;q=1.0, compress;q=0.5",
            "Accept-Language": "zh-Hans-CN;q=1.0, en-CN;q=0.9",
        }
        session = requests.Session()
        session.headers.update(headers)
        return session

    def __get_sign(self, params):
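        # The signature is computed by an external cracking service; judging
        # from the checks below, it replies {"status": 1, "data": "<sign>"}
        # on success.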
        req = requests.get(self.__CRACK_SIGN_URL,
                           params={
                               "secret_key": self.secret_key,
                               "check_code": self.check_code,
                               "params": json.dumps(params)
                           },
                           timeout=30)
        req_json = req.json()
        if req_json.get("status") != 1:
            self.log.error("提取sign发生错误,错误原因是:")
            self.log.error(req_json.get("data"))
            return None
        return req_json.get("data")

    def get_moment_list(self):
        self.log.info("开始采集动态页")
        params = {
            "num": 20,
            "start": 0,
            "timestamp": int(time.time()),
            "type": "recommend",
            "user_id": self.user_id,
            "last_object_id": "",
        }

        sign = self.__get_sign(params)
        if not sign:
            return
        params["sign"] = sign
        resp = self.request.get(self.__START_URL,
                                params=params,
                                verify=False,
                                timeout=30)
        resp_json = resp.json()
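        # The posts themselves are expected under resp_json["data"]["list"]
        # (see start() below).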
        return resp_json

    def like_sex(self, post_data, sex=2, exclude_cp=True):
        """
        Like a post when its author's sex matches the target.

        :param post_data: a single post dict from the moment list
        :param sex: target author sex (2 appears to mean female here)
        :param exclude_cp: when True, skip posts from cp (couple) groups
        :return: True if the like succeeded, otherwise False
        """

        is_cp = post_data.get('left_user', None)
        if exclude_cp and is_cp:
            self.log.info("Skipping cp-group post")
            return False
        category = post_data.get("category")
        if category == "topic":
            self.log.info("Skipping topic post...")
            return False

        fid = post_data.get("fid")
        nick_name = post_data["user"].get("nickname")
        post_text = post_data["payload"].get("text")

        mongo_exists = self.__update_like_mongo(fid, nick_name, post_text)
        if mongo_exists == -1:
            self.log.info("Already liked this post before, skipping...")
            return False

        raw_sex = post_data["user"].get('sex')

        if raw_sex == sex:
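            # cancel=0 appears to request a like (1 would presumably cancel it).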
            fid_params = {
                "cancel": "0",
                "fid": fid,
                "timestamp": "0",
                "user_id": self.user_id,
            }
            sign = self.__get_sign(fid_params)
            if not sign:
                return False
            fid_params["sign"] = sign
            resp = self.request.get(self.__LIKE_PID_URL,
                                    params=fid_params,
                                    verify=False,
                                    timeout=30)
            resp_json = resp.json()
            if resp_json.get("message") == "success":
                self.log.info('Successfully liked the post "{}" by user ({})'.format(
                    post_text, nick_name))
                return True
        return False

    def start(self, *args, **kwargs):
        count = 0
        like_count = 0
        while True:
            count += 1
            moment_data = self.get_moment_list()
            if not moment_data:
                self.log.error("Failed to fetch the moment list, retrying...")
                time.sleep(random.randint(7, 10))
                continue
            like_count_batch = 0
            for per_post in moment_data["data"]["list"]:
                like_succeed = self.like_sex(per_post)
                if like_succeed:
                    like_count_batch += 1
                    like_count += 1
                    if like_count % 100 == 0:
                        self.log.info(
                            "Sent {} likes so far...".format(like_count))
                time.sleep(random.randint(1, 2))
            self.log.info("Finished pass {} over the moments feed".format(count))
            # Back off in proportion to how many likes this batch sent.
            time.sleep(
                random.randint(7 * like_count_batch, 10 * like_count_batch))
            now = datetime.datetime.now()
            # Sleep roughly an hour during the 02:00-06:00 window to look human.
            if now.hour in range(2, 6):
                time.sleep(random.randint(3600, 4000))

    def __update_like_mongo(self, fid, nick_name, post_text):
        """Record a like in Mongo; return -1 if this post was liked before, 1 if new."""
        exist_data = self.cp_mongo.find_one(self.cp_table, {"_id": fid})
        if exist_data:
            self.log.info(">>> Found an existing record for this post...")
            exist_data["count"] += 1
            self.cp_mongo.insert_batch_data(self.cp_table, [exist_data])
            return -1
        new_data = {
            "_id": fid,
            "nick_name": nick_name,
            "post_text": post_text,
            "count": 1
        }
        self.cp_mongo.insert_batch_data(self.cp_table, [new_data], insert=True)
        return 1
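

# A minimal usage sketch for YizhoucpCrawl, assuming the credentials below
# are captured from the real app and get_logger is a hypothetical logger
# factory (the App-Id header is derived from the token, so the token is
# expected to look like "<app_id>_<token>"):
#
#     log = get_logger("yizhou")
#     crawler = YizhoucpCrawl(secret_key="<secret_key>",
#                             token="<app_id>_<token>",
#                             user_id="<user_id>",
#                             check_code="<check_code>",
#                             log=log)
#     crawler.start()
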
class DoubanCrawl(object):
    __START_URL = "https://www.douban.com/group/luohuzufang/discussion?start={}"

    __HOST = "www.douban.com"

    def __init__(self, page, log):
        self.__page = page
        self.log = log
        self.log.info("获得 {} 页之后的数据...".format(self.__page))
        self.mongo = MongDb(LocalMongoConfig.HOST,
                            LocalMongoConfig.PORT,
                            LocalMongoConfig.DB,
                            LocalMongoConfig.USER,
                            LocalMongoConfig.PASSWD,
                            log=self.log)
        self.table = "douban"
        self.request = self.__init_request()
        self.douban_handler = DouBanInfoHandler()

    def __init_request(self):
        headers = {
            "Host": self.__HOST,
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6",
        }
        session = requests.Session()
        session.headers.update(headers)
        return session

    def __get_page_data(self, page_num=0, start_url=None):
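        # start_url, when provided, overrides the default group URL; both are
        # expected to contain a "{}" placeholder for the pagination offset.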
        url = start_url.format(
            page_num) if start_url else self.__START_URL.format(page_num)
        try:
            resp = self.request.get(url, timeout=30)
            resp.raise_for_status()
        except requests.RequestException:
            self.log.error("Failed to fetch the list page...")
            return -1

        html_resp = html.fromstring(resp.text)

        # Iterate over every discussion row in the listing table
        discussion_extract = html_resp.xpath(
            '//div[@class="article"]//tr[@class=""]')

        item_list = []
        for per_discussion in discussion_extract:
            title = per_discussion.xpath('./td[@class="title"]/a/@title')[0]
            detail_url = per_discussion.xpath(
                './td[@class="title"]/a/@href')[0]
            author = per_discussion.xpath('./td[2]/a/text()')[0]
            author_url = per_discussion.xpath('./td[2]/a/@href')[0]
            comment_count_raw = per_discussion.xpath('./td[3]/text()')
            comment_count = comment_count_raw[0] if comment_count_raw else 0
            comment_date = per_discussion.xpath('./td[4]/text()')[0]

            extract_info = self.douban_handler.clean_data(title)

            item = {
                "title": title,
                "detail_url": detail_url,
                "author": author,
                "author_url": author_url,
                "comment_count": comment_count,
                "comment_date": comment_date,
            }

            new_item = {**extract_info, **item}
            item_list.append(new_item)
        self.mongo.insert_batch_data(self.table, item_list, key="detail_url")
        return 1

    def start(self, *args, **kwargs):
        for url in init_urls:
            self.log.info("Now crawling group URL: {}".format(url))
            for i in tqdm(range(0, self.__page + 1)):
                self.log.info("About to crawl page {}".format(i))
                # Douban lists 25 discussions per page, so offset = i * 25.
                grab_list_page_status = self.__get_page_data(i * 25, url)
                if grab_list_page_status == -1:
                    self.log.info("Failed to crawl list page {}, skipping".format(i))
                    continue
                self.log.info("Finished crawling page {}".format(i))
        self.log.info("Crawler exited successfully...")