import copy
import json
import re
import sys

import scrapy
from bs4 import BeautifulSoup
from scrapy import Request
from scrapy.spiders import CrawlSpider

# Project-local items and helpers (ClubTopicInfoScrapyItem and the other item
# classes, StructureStartUrl, get_current_date, formmat_time, get_rule) are
# imported from elsewhere in the repo; their module paths are not shown here.


class AutoHomeTopicInfoSpider(CrawlSpider):
    """Per-club topic counts (rowcount / friendcount)."""

    name = "club_topic_info"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    # Only the aggregate counts in `result` are read, so the page number is fixed.
    base_url = "https://clubnc.app.autohome.com.cn/club_v8.2.0/club/topics-pm2-b%s-btc-r0-ss0-o0-p2-s50-qf0-c110100-t0-v8.8.0.json"
    start_urls = [base_url % club_id_list[club_index]]

    def parse(self, response):
        item = ClubTopicInfoScrapyItem()
        # dont_filter is required: this re-requests the URL just fetched.
        yield Request(url=response.url,
                      callback=self.parse_club_topic_info_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_topic_info_items(self, response):
        item = response.meta["item"]
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        item["bbs_id"] = result["clubid"]
        item["row_count"] = result["rowcount"]
        item["friend_count"] = result["friendcount"]
        item["time"] = get_current_date()
        yield item
        # Move on to the next club id, if any.
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            url = self.base_url % self.club_id_list[self.club_index]
            yield Request(url=url,
                          callback=self.parse_club_topic_info_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
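# --- Hedged sketch, not project code: ClubTopicInfoScrapyItem and
# get_current_date() are defined elsewhere in the repo. Reconstructed from the
# field assignments above, minimal stand-ins could look like this (the
# "Sketch" names and the strftime format are assumptions for illustration):

import datetime


class ClubTopicInfoScrapyItemSketch(scrapy.Item):
    """Illustrative stand-in; field names mirror the assignments above."""
    bbs_id = scrapy.Field()
    row_count = scrapy.Field()
    friend_count = scrapy.Field()
    time = scrapy.Field()


def get_current_date_sketch():
    """Illustrative stand-in for get_current_date(): the crawl timestamp."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")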
class AutoHomeCircleSpider(CrawlSpider):
    """Per-club friend-circle row counts."""

    name = "club_circle"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getcarfriendcirclelist-pm2-utd5a4a902aa6c4db1b5ca0adb2df71dda03f29a2a-b%s-p1-s20.json"
    start_urls = [base_url % club_id_list[club_index]]

    def parse(self, response):
        item = ClubCircleScrapyItem()
        # dont_filter is required: this re-requests the URL just fetched.
        yield Request(url=response.url,
                      callback=self.parse_club_circle_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_circle_items(self, response):
        item = response.meta["item"]
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["row_count"] = result["rowcount"]
        item["time"] = get_current_date()
        yield item
        # Move on to the next club id, if any.
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            url = self.base_url % self.club_id_list[self.club_index]
            yield Request(url=url,
                          callback=self.parse_club_circle_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
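# --- Hedged sketch, not project code: StructureStartUrl is imported from
# elsewhere. From its call sites, get_bbs_id() returns every club id and
# get_topic_id(offset) returns topic ids in batches of up to 1000 (the topic
# spiders below step offset by 1000). A database-backed stand-in might look
# like this; connection details, table and column names are all assumptions.

import pymysql


class StructureStartUrlSketch:
    def _query(self, sql, args=None):
        conn = pymysql.connect(host="localhost", user="root",
                               password="", database="autohome")
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql, args)
                return [row[0] for row in cursor.fetchall()]
        finally:
            conn.close()

    def get_bbs_id(self):
        # Assumed: one row per club forum.
        return self._query("SELECT bbs_id FROM club")

    def get_topic_id(self, offset=0):
        # Assumed: 1000-row pages, matching the `offset += 1000` stepping
        # used by the topic spiders.
        return self._query("SELECT topic_id FROM topic LIMIT %s, 1000", (offset,))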
class AutoHomeClubActivityFriendDetailSpider(CrawlSpider):
    """Active and recommended club friends, per club and page."""

    name = "club_activity_friend_detail"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    # club_id_list = [834]
    club_index = 0
    page_index = 1
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getactivityfriendlist-pm2-b%s-t2-c0-u66230826-p%s-s20.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        item = ClubActivityFriendDetailScrapyItem()
        # dont_filter is required: this re-requests the URL just fetched.
        yield Request(url=response.url,
                      callback=self.parse_club_activity_friend_detail_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_activity_friend_detail_items(self, response):
        item = response.meta["item"]
        content = json.loads(response.body.decode(), strict=False)
        activity_friend_list = content["result"]["activityfriendlist"]  # active friends
        club_master_list = content["result"]["clubmasterlist"]  # recommended friends
        for club_master in club_master_list:
            item["bbs_id"] = self.club_id_list[self.club_index]
            item["user_id"] = club_master["userid"]
            item["recommend"] = 0
            item["time"] = get_current_date()
            yield item
        for activity_friend in activity_friend_list:
            item["bbs_id"] = self.club_id_list[self.club_index]
            item["user_id"] = activity_friend["userid"]
            item["recommend"] = 1
            item["time"] = get_current_date()
            yield item
        # Next page of the current club, then the next club.
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield Request(url=url,
                          callback=self.parse_club_activity_friend_detail_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield Request(url=url,
                              callback=self.parse_club_activity_friend_detail_items,
                              meta={"item": copy.deepcopy(item)},
                              dont_filter=True)
class AutoHomeTopicListSpider(CrawlSpider):
    """Topic listings for every club, paged 50 per request."""

    name = "club_topic_list"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    page_index = 1
    base_url = "https://clubnc.app.autohome.com.cn/club_v8.2.0/club/topics-pm2-b%s-btc-r0-ss0-o0-p%s-s50-qf0-c110100-t0-v8.8.0.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        item = ClubTopicListScrapyItem()
        # dont_filter is required: this re-requests the URL just fetched.
        yield Request(url=response.url,
                      callback=self.parse_club_topic_list_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_topic_list_items(self, response):
        item = response.meta["item"]
        content = json.loads(response.body.decode(), strict=False)
        topic_list = content["result"]["list"]
        for topic in topic_list:
            item["topic_id"] = topic["topicid"]
            item["bbs_id"] = topic["bbsid"]
            item["title"] = topic["title"]
            item["user_id"] = topic["userid"]
            item["reply_counts"] = topic["replycounts"]
            item["post_topic_date"] = topic["posttopicdate"]
            item["last_reply_date"] = topic["lastreplydate"]
            item["topic_type"] = topic["topictype"]
            item["time"] = get_current_date()
            yield item
        # Next page of the current club, then the next club.
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield Request(url=url,
                          callback=self.parse_club_topic_list_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield Request(url=url,
                              callback=self.parse_club_topic_list_items,
                              meta={"item": copy.deepcopy(item)},
                              dont_filter=True)
def parse_club_topic_comment_items(self, response):
    """Comment floors for one topic page, 20 comments per page.

    Note: despite the name, ``club_id_list`` holds topic ids here (it is
    refilled from ``get_topic_id``), so ``topic_id`` is set from it.
    """
    i = 0
    item = response.meta["item"]
    soup = BeautifulSoup(response.body.decode(), "html.parser")
    node_list = response.xpath('//*[@class="post-flow"]/li')
    soup_node_list = soup.find_all("div", {"class": "user-content"})
    for node in node_list:
        item["topic_id"] = self.club_id_list[self.club_index]
        try:
            item["user"] = node.xpath('.//span[@class="name"]/a/text()').extract()[0]
        except Exception:
            item["user"] = ""
        try:
            item["content"] = soup_node_list[i].get_text().strip("\n")
        except Exception:
            item["content"] = ""
        try:
            item["publish_time"] = node.xpath('.//*[@class="time"]/text()').extract()[0]
        except Exception:
            item["publish_time"] = ""
        try:
            item["id"] = node.xpath('.//span[@class="flowLevel"]/span/text()').extract()[0]
        except Exception:
            item["id"] = ""
        if item["id"] != "":
            yield item
        i += 1
    # Keep paging while the page returned comments; an empty page means the
    # current topic is done and we move to the next topic id.
    self.page_index += 1
    if 0 < len(soup_node_list) <= 20:
        url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
        yield Request(url=url, callback=self.parse_club_topic_comment_items,
                      meta={"item": copy.deepcopy(item)})
    else:
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            self.page_index = 1
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield Request(url=url, callback=self.parse_club_topic_comment_items,
                          meta={"item": copy.deepcopy(item)})
        else:
            # Current batch exhausted: fetch the next 1000 topic ids.
            self.offset += 1000
            self.club_id_list = StructureStartUrl().get_topic_id(self.offset)
            if 1000 >= len(self.club_id_list) > 0:
                self.club_index = 0
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield Request(url=url, callback=self.parse_club_topic_comment_items,
                              meta={"item": copy.deepcopy(item)})
class AutoHomeCircleDetailSpider(CrawlSpider):
    """Friend-circle details for every club, paged 20 per request."""

    name = "club_circle_detail"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    # club_id_list = [834]
    club_index = 0
    page_index = 1
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getcarfriendcirclelist-pm2-utd5a4a902aa6c4db1b5ca0adb2df71dda03f29a2a-b%s-p%s-s20.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        item = ClubCircleDetailScrapyItem()
        # dont_filter is required: this re-requests the URL just fetched.
        yield Request(url=response.url,
                      callback=self.parse_club_circle_detail_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_circle_detail_items(self, response):
        item = response.meta["item"]
        content = json.loads(response.body.decode(), strict=False)
        circle_list = content["result"]["list"]
        for circle in circle_list:
            item["bbs_id"] = circle["bbsid"]
            item["circle_id"] = circle["circleid"]
            item["circle_name"] = circle["circlename"]
            item["user_count"] = circle["usercount"]
            item["province_id"] = circle["provinceid"]
            item["city_id"] = circle["cityid"]
            item["explain"] = circle["explain"]
            item["activen_num"] = circle["activennum"]
            item["create_time"] = circle["createtime"]
            item["last_update_time"] = circle["lastupdatetime"]
            item["owner_id"] = circle["ownerid"]
            item["time"] = get_current_date()
            yield item
        # Next page of the current club, then the next club.
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield Request(url=url,
                          callback=self.parse_club_circle_detail_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield Request(url=url,
                              callback=self.parse_club_circle_detail_items,
                              meta={"item": copy.deepcopy(item)},
                              dont_filter=True)
class AutoHomeTopicReadSpider(CrawlSpider):
    """Topic read counts (views and replies)."""

    name = "club_topic_read"
    # topic_id_list = [1570, 2715, 3246]
    offset = 0
    topic_id_list = StructureStartUrl().get_topic_id(offset)
    topic_index = 0
    base_url = "https://forum.app.autohome.com.cn/forum_v7.9.5/forum/club/topicaddclicksajax?topicid=%s"
    start_urls = [base_url % topic_id_list[topic_index]]

    def parse(self, response):
        item = ClubTopicReadScrapyItem()
        # dont_filter is required: this re-requests the URL just fetched.
        yield Request(url=response.url,
                      callback=self.parse_club_topic_read_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_topic_read_items(self, response):
        item = response.meta["item"]
        # The endpoint wraps the JSON object in extra text; pull out the
        # first {...} block before parsing.
        content = response.body.decode()
        content = re.search(r"{[^}]+}", content).group()
        content = json.loads(content, strict=False)
        item["topic_id"] = content["TopicId"]
        item["reply"] = content["Replys"]
        item["view"] = content["Views"]
        item["time"] = get_current_date()
        yield item
        # Next topic id; when the batch runs out, fetch the next 1000 ids.
        self.topic_index += 1
        if self.topic_index < len(self.topic_id_list):
            url = self.base_url % self.topic_id_list[self.topic_index]
            yield Request(url=url,
                          callback=self.parse_club_topic_read_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
        else:
            self.offset += 1000
            self.topic_id_list = StructureStartUrl().get_topic_id(self.offset)
            if 1000 >= len(self.topic_id_list) > 0:
                self.topic_index = 0
                url = self.base_url % self.topic_id_list[self.topic_index]
                yield Request(url=url,
                              callback=self.parse_club_topic_read_items,
                              meta={"item": copy.deepcopy(item)},
                              dont_filter=True)
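# --- Worked example of the extraction in parse_club_topic_read_items: the
# read-count endpoint returns a JSON object wrapped in extra text (the exact
# wrapper below is an assumed example), and re.search(r"{[^}]+}", ...) pulls
# out the first {...} block before json.loads():

_sample_body = 'callback({"TopicId":1570,"Replys":12,"Views":3456})'
_match = re.search(r"{[^}]+}", _sample_body)
assert json.loads(_match.group())["Views"] == 3456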
class AutoHomeClubTopicListSpider(scrapy.Spider):
    """Topic listings, with relative dates normalized to absolute ones."""

    name = "auto_home_club_topic_list"
    club_id_list = StructureStartUrl().get_bbs_id()
    # club_id_list = [4744]
    club_index = 0
    page_index = 1
    base_url = "https://clubnc.app.autohome.com.cn/club_v8.2.0/club/topics-pm2-b%s-btc-r0-ss0-o0-p%s-s50-qf0-c110100-t0-v8.8.0.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        item = ClubTopicListItem()
        content = json.loads(response.body.decode(), strict=False)
        topic_list = content["result"]["list"]
        for topic in topic_list:
            item["topic_id"] = topic["topicid"]
            item["bbs_id"] = topic["bbsid"]
            item["title"] = topic["title"]
            item["user_id"] = topic["userid"]
            item["reply_counts"] = topic["replycounts"]
            # Relative dates such as "5分钟前" ("5 minutes ago") are
            # converted to absolute timestamps.
            post_topic_date = topic["posttopicdate"]
            if "前" in post_topic_date:
                item["post_topic_date"] = formmat_time(post_topic_date)
            else:
                item["post_topic_date"] = post_topic_date
            last_reply_date = topic["lastreplydate"]
            if "前" in last_reply_date:
                item["last_reply_date"] = formmat_time(last_reply_date)
            else:
                item["last_reply_date"] = last_reply_date
            item["topic_type"] = topic["topictype"]
            item["time"] = get_current_date()
            yield item
        # Next page of the current club, then the next club.
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
        else:
            self.club_index += 1
            percent = self.club_index / len(self.club_id_list)
            sys.stdout.write("\r" + "Progress: %d%% (%d/%d)"
                             % (percent * 100, self.club_index, len(self.club_id_list)))
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
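# --- Hedged sketch, not project code: formmat_time() is imported from a
# project helper. It is only applied to values containing "前" (relative
# times such as "5分钟前" / "3小时前"), so a minimal stand-in could be the
# following; the supported units and the output format are assumptions.

def formmat_time_sketch(relative):
    """Convert a relative Chinese timestamp like '5分钟前' to an absolute one."""
    import datetime
    match = re.match(r"(\d+)(分钟|小时|天)前", relative)
    if not match:
        return relative  # unknown format: pass through unchanged
    value, unit = int(match.group(1)), match.group(2)
    delta = {"分钟": datetime.timedelta(minutes=value),   # minutes
             "小时": datetime.timedelta(hours=value),     # hours
             "天": datetime.timedelta(days=value)}[unit]  # days
    return (datetime.datetime.now() - delta).strftime("%Y-%m-%d %H:%M:%S")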
class AutoHomeClubActivityFriendsSpider(scrapy.Spider):
    """Per-club count of active friends."""

    name = "auto_home_club_activity_friends"
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getactivityfriendlist-pm2-b%s-t2-c0-u66230826-p1-s20.json"
    start_urls = [base_url % club_id_list[club_index]]

    def parse(self, response):
        item = ClubActivityFriendsItem()
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["activity_friend_count"] = result["activityfriendcount"]
        item["time"] = get_current_date()
        yield item
        # Move on to the next club id, if any.
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            url = self.base_url % self.club_id_list[self.club_index]
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
class AutoHomeClubCircleSpider(scrapy.Spider):
    """Friend-circle listings from the newer chat.api endpoint."""

    name = "auto_home_club_circle"
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    page_index = 1
    # base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getcarfriendcirclelist-pm2-utd5a4a902aa6c4db1b5ca0adb2df71dda03f29a2a-b%s-p1-s20.json"
    base_url = "https://chat.api.autohome.com.cn/c1/s1/api/getSeriesProvinceTagCyqList?cyqType=1&convertId=%s&memberId=0&pageIndex=%s&pageSize=15&_appid=club.pc"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        item = ClubCircleItem()
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["row_count"] = result["rowCount"]
        details = result["list"]
        page_count = result["pageCount"]
        for detail in details:
            item["targetId"] = detail["targetId"]        # circle id
            item["seriesId"] = detail["seriesId"]        # car-series id
            item["score"] = detail["score"]              # popularity score
            item["title"] = detail["title"]              # circle name
            item["explain"] = detail["explain"]          # description
            item["memberCount"] = detail["memberCount"]  # member count
            item["time"] = get_current_date()
            yield item
        # Next page of the current club (<= so the final page is fetched),
        # then the next club.
        self.page_index += 1
        if self.page_index <= page_count:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
def __init__(self):
    self.index = 1
    self.base_url = "https://forum.app.autohome.com.cn/forum_v7.9.5/forum/club/topiccontent-a2-pm2-v8.8.0-t%s-o0-p1-s20-c1-nt0-fs0-sp0-al0-cw360-i0-ct1.json"
    self.topic_list = StructureStartUrl().get_topic_id()
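# --- Hedged sketch: only the __init__ above is shown, so how the spider
# consumes self.topic_list is an assumption. Inside the same spider class, a
# start_requests() along these lines would issue one request per topic id:

def start_requests(self):
    for topic_id in self.topic_list:
        yield scrapy.Request(self.base_url % topic_id, callback=self.parse)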
class AutoHomeClubTopicCommentsSpider(scrapy.Spider):
    """Topic comments, including text hidden behind CSS-class obfuscation.

    Note: ``club_id_list`` actually holds topic ids (it comes from
    ``get_topic_id``), and each page carries up to 20 comment floors.
    """

    name = "auto_home_club_topic_comments"
    club_index = 0
    page_index = 1
    club_id_list = StructureStartUrl().get_topic_id()
    base_url = "https://forum.app.autohome.com.cn/forum_v7.9.5/forum/club/topiccontent-a2-pm2-v8.8.0-t%s-o0-p%s-s20-c1-nt0-fs0-sp0-al0-cw360-i0-ct1.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        item = ClubTopicCommentsItem()
        i = 0
        soup = BeautifulSoup(response.body.decode("utf-8"), "html.parser")
        node_list = response.xpath('//*[@class="post-flow"]/li')
        soup_node_list = soup.find_all("div", {"class": "user-content"})
        # get_rule() maps each obfuscated span class to the character it
        # stands for; it only depends on the page, so compute it once.
        rule = get_rule(response.text)
        for node in node_list:
            item["topic_id"] = self.club_id_list[self.club_index]
            try:
                item["user"] = node.xpath('.//span[@class="name"]/a/text()').extract()[0]
            except Exception:
                item["user"] = ""
            try:
                # The user id is embedded in the profile href after a "㊣" marker.
                item["user_id"] = node.xpath('.//span[@class="name"]/a/@href').get().split("㊣")[1]
            except Exception:
                item["user_id"] = ""
            try:
                # Append the decoded character to each obfuscated span so the
                # extracted text comes out complete.
                span_list = soup_node_list[i].find_all("span")
                for span in span_list:
                    span.append(rule[span["class"][0]])
                item["content"] = soup_node_list[i].get_text().strip("\n")
            except Exception:
                item["content"] = "该帖已删除"  # "this post has been deleted"
            try:
                item["publish_time"] = node.xpath('.//*[@class="time"]/text()').extract()[0]
            except Exception:
                item["publish_time"] = ""
            try:
                item["id"] = node.xpath('.//span[@class="flowLevel"]/span/text()').extract()[0]
            except Exception:
                item["id"] = ""
            if item["id"] != "":
                yield item
            i += 1
        # Keep paging while the page returned comments; an empty page means
        # the current topic is done and we move to the next topic id.
        self.page_index += 1
        if 0 < len(soup_node_list) <= 20:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield scrapy.Request(url=url, callback=self.parse)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield scrapy.Request(url=url, callback=self.parse)
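# --- Usage sketch: besides the `scrapy crawl <name>` CLI, any spider above
# can be run programmatically with a CrawlerProcess. The settings here are
# illustrative assumptions, not the project's real configuration.

if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "DOWNLOAD_DELAY": 1,       # throttle requests to the API endpoints
        "CONCURRENT_REQUESTS": 4,
    })
    process.crawl(AutoHomeClubTopicCommentsSpider)
    process.start()  # blocks until the crawl finishes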