class YizhoucpCrawl(object):
    """Crawler for the Right ("yizhou") app that auto-likes feed posts.

    Pages through the recommended moment feed, skips CP-group and topic
    posts, dedupes against a local MongoDB collection, then likes posts
    whose author matches the wanted sex.  Request signatures are obtained
    from an external sign-cracking service.
    """

    # Feed listing endpoint.
    __START_URL = "https://api.myrightone.com/api/feed/moment-list"
    # Like endpoint.
    __LIKE_PID_URL = "https://api.myrightone.com/api/feed/like"
    # External service that computes the per-request signature.
    __CRACK_SIGN_URL = "http://wx.zxiaoji.com/cp"
    __HOST = "api.myrightone.com"

    def __init__(self, secret_key, token, user_id, check_code, log):
        """
        :param secret_key: key handed to the sign-cracking service
        :param token: app token; its prefix before "_" doubles as App-Id
        :param user_id: id of the account performing the likes
        :param check_code: extra credential for the sign-cracking service
        :param log: logger instance used throughout the crawler
        """
        self.log = log
        self.secret_key = secret_key
        self.user_id = user_id
        self.token = token
        self.check_code = check_code
        self.request = self.__init_request()
        self.cp_mongo = MongDb(LocalMongoConfig.HOST,
                               LocalMongoConfig.PORT,
                               LocalMongoConfig.DB,
                               LocalMongoConfig.USER,
                               LocalMongoConfig.PASSWD,
                               log=self.log)
        self.cp_table = "yizhou_cp"

    def __init_request(self):
        """Build the requests.Session carrying the app's default headers.

        NOTE: renamed from the original typo "__init_reqeust"; the method is
        private and only called from __init__ above.
        """
        headers = {
            "Host": self.__HOST,
            # App-Id is the token prefix before the first underscore.
            "App-Id": self.token.split("_")[0],
            "Platform": "ios",
            "Token": self.token,
            "User-Agent": "Right-iOS/3.33.2 (com.myrightone.datecha; build:224; iOS 12.1.2) Alamofire/4.8.0",
            "Accept": "*/*",
            "Accept-Encoding": "gzip;q=1.0, compress;q=0.5",
            "Accept-Language": "zh-Hans-CN;q=1.0, en-CN;q=0.9",
        }
        self.request = requests.Session()
        self.request.headers = headers
        return self.request

    def __get_sign(self, params):
        """Ask the external service to sign a parameter dict.

        :param params: dict of request params to be signed
        :return: signature string, or None when the service reports failure
        """
        req = requests.get(self.__CRACK_SIGN_URL,
                           params={
                               "secret_key": self.secret_key,
                               "check_code": self.check_code,
                               "params": json.dumps(params)
                           },
                           timeout=30)
        req_json = req.json()
        if req_json.get("status") != 1:
            self.log.error("提取sign发生错误,错误原因是:")
            self.log.error(req_json.get("data"))
            return None
        return req_json.get("data")

    def get_moment_list(self):
        """Fetch one page of the recommended moment feed.

        :return: parsed JSON response dict, or None when signing failed
        """
        self.log.info("开始采集动态页")
        params = {
            "num": 20,
            "start": 0,
            "timestamp": int(time.time()),
            "type": "recommend",
            "user_id": self.user_id,
            "last_object_id": "",
        }
        sign = self.__get_sign(params)
        if not sign:
            return None  # explicit (was a bare return) so callers can test it
        params["sign"] = sign
        resp = self.request.get(self.__START_URL,
                                params=params,
                                verify=False,
                                timeout=30)
        return resp.json()

    def like_sex(self, post_data, sex=2, exclude_cp=True):
        """Like a single post if it passes all filters.

        :param post_data: one post dict from the feed list
        :param sex: author sex to like (default 2)
        :param exclude_cp: skip posts that belong to a CP group
        :return: True when the like succeeded, False otherwise
        """
        if exclude_cp and post_data.get('left_user', None):
            self.log.info("过滤掉cp组")
            return False
        if post_data.get("category") == "topic":
            self.log.info("过滤掉话题..")
            return False
        fid = post_data.get("fid")
        nick_name = post_data["user"].get("nickname")
        post_text = post_data["payload"].get("text")
        if self.__update_like_mongo(fid, nick_name, post_text) == -1:
            self.log.info("之前已对这条数据点过赞了,跳过...")
            return False
        if post_data["user"].get('sex') != sex:
            # BUGFIX: the original fell off the end here, returning None
            # instead of the bool every other path returns.
            return False
        fid_params = {
            "cancel": "0",
            "fid": fid,
            "timestamp": "0",
            "user_id": self.user_id,
        }
        sign = self.__get_sign(fid_params)
        if not sign:
            return False
        fid_params["sign"] = sign
        resp = self.request.get(self.__LIKE_PID_URL,
                                params=fid_params,
                                verify=False,
                                timeout=30)
        if resp.json().get("message") == "success":
            self.log.info("给用户({})发布的【{}】点赞成功".format(
                nick_name, post_text))
            return True
        # BUGFIX: explicit False on API failure (was an implicit None).
        return False

    def start(self, *args, **kwargs):
        """Main loop: repeatedly fetch the feed and like matching posts."""
        count = 0
        like_count = 0
        while True:
            count += 1
            moment_data = self.get_moment_list()
            # BUGFIX: get_moment_list returns None when signing fails; the
            # original crashed with TypeError on moment_data["data"].
            if not moment_data:
                time.sleep(random.randint(7, 10))
                continue
            like_count_batch = 0
            for per_post in moment_data["data"]["list"]:
                if self.like_sex(per_post):
                    like_count_batch += 1
                    like_count += 1
                    time.sleep(random.randint(1, 2))
                    # Milestone log only after an actual like, so it cannot
                    # fire repeatedly while like_count is still 0.
                    if like_count % 100 == 0:
                        self.log.info(
                            "当前已经对 {} 位小姐姐点过赞了...".format(like_count))
            self.log.info("当前已经遍历了第 {} 次动态".format(count))
            # Back off proportionally to how many likes this batch produced.
            time.sleep(
                random.randint(7 * like_count_batch, 10 * like_count_batch))
            now = datetime.datetime.now()
            # Sleep through the small hours (02:00-05:59) to look human.
            if now.hour in range(2, 6):
                time.sleep(random.randint(3600, 4000))

    def __update_like_mongo(self, fid, nick_name, post_text):
        """Record the post in MongoDB, deduping on fid.

        :param fid: post id, used as the Mongo _id
        :param nick_name: author nickname (stored for reference)
        :param post_text: post body text (stored for reference)
        :return: -1 if the post was already seen (count incremented),
                 1 if a new record was inserted
        """
        exist_data = self.cp_mongo.find_one(self.cp_table, {"_id": fid})
        if exist_data:
            self.log.info(">>>找到相同的数据啦...")
            exist_data["count"] = exist_data["count"] + 1
            # NOTE(review): relies on insert_batch_data upserting by _id when
            # insert is not forced — confirm against the MongDb wrapper.
            self.cp_mongo.insert_batch_data(self.cp_table, [exist_data])
            return -1
        new_data = {
            "_id": fid,
            "nick_name": nick_name,
            "post_text": post_text,
            "count": 1
        }
        self.cp_mongo.insert_batch_data(self.cp_table, [new_data],
                                        insert=True)
        return 1
class DoubanCrawl(object):
    """Crawler for Douban rental-group discussion boards.

    Pages through each group URL in init_urls, extracts per-post metadata
    from the discussion table, enriches it via DouBanInfoHandler, and stores
    rows in MongoDB keyed by detail_url.
    """

    # List-page URL template; {} receives the "start" offset.
    __START_URL = "https://www.douban.com/group/luohuzufang/discussion?start={}"
    __HOST = "www.douban.com"

    def __init__(self, page, log):
        """
        :param page: number of list pages to crawl per group
        :param log: logger instance
        """
        self.__page = page
        self.log = log
        self.log.info("获得 {} 页之后的数据...".format(self.__page))
        self.mongo = MongDb(LocalMongoConfig.HOST,
                            LocalMongoConfig.PORT,
                            LocalMongoConfig.DB,
                            LocalMongoConfig.USER,
                            LocalMongoConfig.PASSWD,
                            log=self.log)
        self.table = "douban"
        self.request = self.__init_request()
        self.douban_handler = DouBanInfoHandler()

    def __init_request(self):
        """Build the requests.Session with browser-like headers.

        NOTE: renamed from the original typo "__init_reqeust"; the method is
        private and only called from __init__ above.
        """
        headers = {
            "Host": self.__HOST,
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6",
        }
        self.request = requests.Session()
        self.request.headers = headers
        return self.request

    @staticmethod
    def __first(nodes, default=""):
        """Return the first xpath match, or a default when the row is sparse."""
        return nodes[0] if nodes else default

    def __get_page_data(self, page_num=0, start_url=None):
        """Scrape one discussion list page and persist its rows.

        :param page_num: value for the "start" query parameter (offset)
        :param start_url: optional URL template overriding __START_URL
        :return: -1 on request failure, otherwise None
        """
        url = (start_url or self.__START_URL).format(page_num)
        # BUGFIX: requests raises on network failure — it never returns None,
        # so the original "resp is None" check was dead code and any network
        # error crashed the start() loop instead of being skipped.
        try:
            resp = self.request.get(url)
        except requests.RequestException:
            self.log.error("请求列表页出错...")
            return -1
        html_resp = html.fromstring(resp.text)
        # Every row of the discussion table.
        discussion_extract = html_resp.xpath(
            '//div[@class="article"]//tr[@class=""]')
        item_list = []
        for per_discussion in discussion_extract:
            title_nodes = per_discussion.xpath('./td[@class="title"]/a/@title')
            link_nodes = per_discussion.xpath('./td[@class="title"]/a/@href')
            # Robustness: skip malformed rows instead of raising IndexError
            # (the original only guarded comment_count this way).
            if not title_nodes or not link_nodes:
                continue
            title = title_nodes[0]
            detail_url = link_nodes[0]
            author = self.__first(per_discussion.xpath('./td[2]/a/text()'))
            author_url = self.__first(per_discussion.xpath('./td[2]/a/@href'))
            comment_count = self.__first(
                per_discussion.xpath('./td[3]/text()'), 0)
            comment_date = self.__first(per_discussion.xpath('./td[4]/text()'))
            # Derive structured fields (rent info etc.) from the raw title.
            extract_info = self.douban_handler.clean_data(title)
            item = {
                "title": title,
                "detail_url": detail_url,
                "author": author,
                "author_url": author_url,
                "comment_count": comment_count,
                "comment_date": comment_date,
            }
            item_list.append({**extract_info, **item})
        # Dedupe on detail_url at the storage layer.
        self.mongo.insert_batch_data(self.table, item_list, key="detail_url")

    def start(self, *args, **kwargs):
        """Crawl self.__page list pages for every group URL in init_urls."""
        for url in init_urls:
            self.log.info("当前采集小组的链接是:{}".format(url))
            for i in tqdm(range(0, self.__page + 1)):
                self.log.info("当前即将采集第 {} 页".format(i))
                # Each list page holds 25 discussions, hence the offset step.
                grab_list_page_status = self.__get_page_data(i * 25, url)
                if grab_list_page_status == -1:
                    self.log.info("当前采集列表页出错, 当前页面是第 {} 页".format(i))
                    continue
                self.log.info("当前页面采集完成: page = {}".format(i))
        self.log.info("成功退出采集程序...")