def parse(self, response):
    """Parse one page of a club's friend lists.

    Yields one item per member of the "clubmasterlist" (recommend=0) and one
    per member of the "activityfriendlist" (recommend=1), then paginates
    through the club's pages and advances to the next club id when the pages
    are exhausted. Crawl state (club_index / page_index) lives on the spider,
    so requests for one club must be processed before the next begins.
    """
    item = ClubActivityFriendsDetailsItem()
    content = json.loads(response.body.decode(), strict=False)
    activity_friend_list = content["result"]["activityfriendlist"]  # active club members
    club_master_list = content["result"]["clubmasterlist"]  # recommended club members
    for club_master in club_master_list:
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["user_id"] = club_master["userid"]
        # NOTE(review): 0 for the "recommended" list vs 1 for "active" below
        # looks inverted relative to the field name — confirm intended encoding.
        item["recommend"] = 0
        item["time"] = get_current_date()
        yield item
    for activity_friend in activity_friend_list:
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["user_id"] = activity_friend["userid"]
        item["recommend"] = 1
        item["time"] = get_current_date()
        yield item
    self.page_index += 1
    if self.page_index <= content["result"]["pagecount"]:
        print(self.page_index)
        url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
        print(url)
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
    else:
        # all pages of this club consumed — start at page 1 of the next club id
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            self.page_index = 1
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
def parse_club_circle_detail_items(self, response):
    """Parse one page of a club's circle details and paginate.

    The partially-filled item travels through request meta; each circle in
    the page is copied into it and yielded. Follow-up requests receive a
    deep copy of the item so concurrently scheduled callbacks do not share
    mutable state.
    """
    item = response.meta['item']
    content = json.loads(response.body.decode(), strict=False)
    circle_list = content["result"]["list"]
    for circle in circle_list:
        item["bbs_id"] = circle["bbsid"]
        item["circle_id"] = circle["circleid"]
        item["circle_name"] = circle["circlename"]
        item["user_count"] = circle["usercount"]
        item["province_id"] = circle["provinceid"]
        item["city_id"] = circle["cityid"]
        item["explain"] = circle["explain"]
        item["activen_num"] = circle["activennum"]
        item["create_time"] = circle["createtime"]
        item["last_update_time"] = circle["lastupdatetime"]
        item["owner_id"] = circle["ownerid"]
        item["time"] = get_current_date()
        yield item
    self.page_index += 1
    if self.page_index <= content["result"]["pagecount"]:
        print(self.page_index)
        url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
        print(url)
        yield Request(url=url, callback=self.parse_club_circle_detail_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
    else:
        # all pages of this club consumed — move to the next club id
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            self.page_index = 1
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield Request(url=url, callback=self.parse_club_circle_detail_items,
                          meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse(self, response):
    """Parse one page of a club's circle list into ClubCircleItem rows.

    Fixes vs. the original:
    - the pagination check used ``<`` while every sibling callback in this
      project uses ``<= pagecount`` with the same 1-based page_index, which
      silently skipped the final page;
    - the pagination request now passes ``dont_filter=True`` like every
      other callback here, so the dupe filter cannot swallow a retried page.
    """
    item = ClubCircleItem()
    content = json.loads(response.body.decode(), strict=False)
    result = content["result"]
    item["bbs_id"] = self.club_id_list[self.club_index]
    item["row_count"] = result["rowCount"]
    details = result["list"]
    page_count = result["pageCount"]
    for detail in details:
        item["targetId"] = detail["targetId"]        # circle id
        item["seriesId"] = detail["seriesId"]        # car-series id
        item["score"] = detail["score"]              # popularity score
        item["title"] = detail["title"]              # circle name
        item["explain"] = detail["explain"]          # description
        item["memberCount"] = detail["memberCount"]  # member count
        item["time"] = get_current_date()
        yield item
    self.page_index += 1
    if self.page_index <= page_count:  # was `<`: off-by-one that dropped the last page
        url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
        yield scrapy.Request(url, callback=self.parse, dont_filter=True)
    else:
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            self.page_index = 1
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
def parse_article_comment_items(self, response):
    """Parse one page (20 entries) of comments for the current article.

    Pagination is cursor-based: the id of the last comment yielded is stored
    in self.last_time and fed back into the next url as the cursor. Once
    ceil(totalcount / 20) pages have been fetched, advance to the next
    article in self.article_list and reset page counter and cursor.
    """
    item = response.meta['item']
    content = json.loads(response.body.decode(), strict=False)
    comment_list = content["result"]["list"]
    for comment in comment_list:
        item["article_id"] = self.article_list[self.article_index][0]
        item["comment_id"] = comment["id"]
        item["floor"] = comment["floor"]
        item["user_id"] = comment["nameid"]
        item["publish_time"] = comment["time"]
        item["content"] = comment["content"]
        item["update_time"] = get_current_date()
        yield item
        # cursor for the next page: id of the newest comment seen
        self.last_time = item["comment_id"]
    self.page_index += 1
    if math.ceil(content["result"]["totalcount"] / 20) >= self.page_index:
        url = self.base_url % (self.article_list[self.article_index][0], self.last_time)
        yield Request(url=url, callback=self.parse_article_comment_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
    else:
        # article exhausted — move on and reset page counter and cursor
        self.article_index += 1
        if self.article_index < len(self.article_list):
            self.page_index = 1
            self.last_time = 0
            url = self.base_url % (
                self.article_list[self.article_index][0], self.last_time)
            print(url)
            yield Request(url=url, callback=self.parse_article_comment_items,
                          meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse_article_list_items(self, response):
    """Parse one page of the article news feed for the current type.

    A full page holds 30 entries; receiving fewer means the feed for the
    current type is exhausted, so the spider advances to the next entry of
    self.type_list and resets the cursor (self.last_time, taken from the
    response's "pageid").

    Fix: the optional "ver" field was read through a broad
    ``try/except Exception`` — replaced with ``dict.get`` so real errors
    (e.g. a malformed payload) are no longer swallowed.
    """
    item = response.meta['item']
    content = json.loads(response.body.decode(), strict=False)
    news_list = content["result"]["newslist"]
    item["last_time"] = content["result"]["pageid"]
    for news in news_list:
        news = news["data"]
        item["id"] = news["id"]
        item["title"] = news["title"]
        item["media_type"] = news["mediatype"]
        item["type"] = news["type"]
        item["publish_time"] = news["updatetime"]
        item["author"] = news["thirdsource"]
        # "ver" is optional in the feed; default to "" instead of try/except.
        item["ver"] = news.get("ver", "")
        item["update_time"] = get_current_date()
        yield item
    self.last_time = item["last_time"]
    if len(news_list) == 30:  # full page — more data likely remains
        url = self.base_url % (self.last_time, self.type_list[self.type_index])
        yield Request(url=url, callback=self.parse_article_list_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
    else:
        self.type_index += 1
        if self.type_index < len(self.type_list):
            self.last_time = 0
            url = self.base_url % (self.last_time, self.type_list[self.type_index])
            yield Request(url=url, callback=self.parse_article_list_items,
                          meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse(self, response):
    """Emit KoubeiCommentItem rows for one page of comments, then follow the
    id-cursor pagination or advance to the next koubei id."""
    item = KoubeiCommentItem()
    result = json.loads(response.body.decode())["result"]
    item["koubei_id"] = self.comment_list[self.comment_index]
    page = result["list"]
    if len(page) > 0:
        for entry in page:
            item["id"] = entry["id"]
            item["user_id"] = entry["nameid"]
            item["content"] = entry["content"]
            item["carname"] = entry["carname"]
            raw_time = entry["time"]
            # relative timestamps ("…前", i.e. "ago") are normalised to absolute ones
            item["create_time"] = formmat_time(raw_time) if "前" in raw_time else raw_time
            item["time"] = get_current_date()
            self.comment_id_list.append(str(entry["id"]))
            yield item
    # a full page (20 entries) whose pageid matches the last seen id means more pages remain
    if len(page) == 20 and result["pageid"] == self.comment_id_list[-1]:
        url = self.base_url % (self.comment_list[self.comment_index], self.comment_id_list[-1])
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
    else:
        self.comment_index += 1
        if self.comment_index < len(self.comment_list):
            self.last_id = 0
            url = self.base_url % (self.comment_list[self.comment_index], self.last_id)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
def parse_koubei_comment_item(self, response):
    """Emit comment rows for the current koubei id, then follow the id-cursor
    pagination or advance to the next koubei id."""
    item = response.meta['item']
    result = json.loads(response.body.decode())["result"]
    item["koubei_id"] = self.comment_list[self.comment_index]
    page = result["list"]
    if len(page) > 0:
        for entry in page:
            item["id"] = entry["id"]
            item["user_id"] = entry["nameid"]
            item["content"] = entry["content"]
            item["carname"] = entry["carname"]
            item["create_time"] = entry["time"]
            item["time"] = get_current_date()
            self.comment_id_list.append(str(entry["id"]))
            yield item
    # a full page (20 entries) whose pageid matches the last seen id means more pages remain
    if len(page) == 20 and result["pageid"] == self.comment_id_list[-1]:
        url = self.base_url % (self.comment_list[self.comment_index], self.comment_id_list[-1])
        yield Request(url=url, callback=self.parse_koubei_comment_item,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
    else:
        self.comment_index += 1
        if self.comment_index < len(self.comment_list):
            self.last_id = 0
            url = self.base_url % (self.comment_list[self.comment_index], self.last_id)
            yield Request(url=url, callback=self.parse_koubei_comment_item,
                          meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse_club_topic_read_items(self, response):
    """Extract reply/view counters for one topic from a non-JSON body.

    The payload is not pure JSON, so the first ``{...}`` object is carved out
    with a regex before json.loads. Topic ids are consumed in batches of up
    to 1000 obtained from StructureStartUrl().get_topic_id(offset); when the
    current batch is exhausted the next batch is fetched and, if non-empty,
    crawling continues.
    """
    item = response.meta['item']
    content = response.body.decode()
    # grab the first brace-delimited object embedded in the payload
    content = re.search(r"{[^}]+}", content).group()
    content = json.loads(content, strict=False)
    item["topic_id"] = content["TopicId"]
    item["reply"] = content["Replys"]
    item["view"] = content["Views"]
    item["time"] = get_current_date()
    yield item
    self.topic_index += 1
    if self.topic_index < len(self.topic_id_list):
        url = self.base_url % self.topic_id_list[self.topic_index]
        yield Request(url=url, callback=self.parse_club_topic_read_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
    else:
        # current batch exhausted — pull the next batch of topic ids
        self.offset += 1000
        self.topic_id_list = StructureStartUrl().get_topic_id(self.offset)
        if 1000 >= len(self.topic_id_list) > 0:
            self.topic_index = 0
            print(self.topic_id_list[self.topic_index])
            url = self.base_url % self.topic_id_list[self.topic_index]
            yield Request(url=url, callback=self.parse_club_topic_read_items,
                          meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse(self, response):
    """Parse one page of a club's topic list into ClubTopicListItem rows.

    Relative timestamps containing "前" ("ago") are normalised with
    formmat_time(). Bug fix: the last-reply branch previously tested
    ``post_topic_date`` (copy-paste error), so ``last_reply_date`` was only
    normalised when the *post* time happened to be relative too.
    """
    item = ClubTopicListItem()
    content = json.loads(response.body.decode(), strict=False)
    topic_list = content["result"]["list"]
    for topic in topic_list:
        item["topic_id"] = topic["topicid"]
        item["bbs_id"] = topic["bbsid"]
        item["title"] = topic["title"]
        item["user_id"] = topic["userid"]
        item["reply_counts"] = topic["replycounts"]
        post_topic_date = topic["posttopicdate"]
        if "前" in post_topic_date:
            item["post_topic_date"] = formmat_time(post_topic_date)
        else:
            item["post_topic_date"] = post_topic_date
        last_reply_date = topic["lastreplydate"]
        if "前" in last_reply_date:  # fixed: originally checked post_topic_date here
            item["last_reply_date"] = formmat_time(last_reply_date)
        else:
            item["last_reply_date"] = last_reply_date
        item["topic_type"] = topic["topictype"]
        item["time"] = get_current_date()
        yield item
    self.page_index += 1
    if self.page_index <= content["result"]["pagecount"]:
        url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
    else:
        self.club_index += 1
        # progress indicator over the club id list
        percent = self.club_index / len(self.club_id_list)
        sys.stdout.write("\r" + "抓取进度:%d%%(%d/%d)" % (percent * 100, self.club_index, len(self.club_id_list)))
        if self.club_index < len(self.club_id_list):
            self.page_index = 1
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
def parse_club_topic_list_items(self, response):
    """Parse one page of a club's topic list and paginate.

    The item travels through request meta; a deep copy is attached to each
    follow-up request so concurrently scheduled callbacks do not mutate a
    shared item. When the club's pages are exhausted, restart at page 1 of
    the next club id.
    """
    item = response.meta['item']
    content = json.loads(response.body.decode(), strict=False)
    topic_list = content["result"]["list"]
    for topic in topic_list:
        item["topic_id"] = topic["topicid"]
        item["bbs_id"] = topic["bbsid"]
        item["title"] = topic["title"]
        item["user_id"] = topic["userid"]
        item["reply_counts"] = topic["replycounts"]
        item["post_topic_date"] = topic["posttopicdate"]
        item["last_reply_date"] = topic["lastreplydate"]
        item["topic_type"] = topic["topictype"]
        item["time"] = get_current_date()
        yield item
    self.page_index += 1
    if self.page_index <= content["result"]["pagecount"]:
        url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
        yield Request(url=url, callback=self.parse_club_topic_list_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
    else:
        # this club exhausted — start at page 1 of the next club id
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            self.page_index = 1
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield Request(url=url, callback=self.parse_club_topic_list_items,
                          meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse_spec_item(self, response):
    """Yield one (spec_id, price) row per spec found in each series element
    of the JSON payload."""
    item = response.meta['item']
    for series in json.loads(response.body.decode()):
        for spec in series["slist"]:
            item["spec_id"] = spec["SpecId"]
            item["price"] = spec["Price"]
            item["time"] = get_current_date()
            yield item
def parse_user_cars_item(self, response):
    """Yield one item per followed car spec, converting the .NET-style
    "/Date(milliseconds)/" certification stamp to local time.

    Fix: the original used ``.rstrip("/)")`` / ``.lstrip("/Date(")``, but
    those strip *character sets*, not prefixes/suffixes — it only worked by
    accident because the inner payload was all digits. The numeric timestamp
    is now extracted with a regex, which also tolerates variants such as
    "/Date(1234567890+0800)/".
    """
    import re  # local import: keeps this fix self-contained
    item = response.meta["item"]
    # payload is GBK-encoded, not UTF-8
    content = json.loads(response.body.decode('gbk'))
    concern_info_list = content["ConcernInfoList"]
    for concern in concern_info_list:
        item["spec_id"] = concern["SpecId"]
        raw_stamp = concern["datetime"]  # e.g. "/Date(1504108800000)/"
        stamp = re.search(r"-?\d+", raw_stamp).group()
        item["cert_date"] = timestamp_to_localtime(int(stamp))
        item["time"] = get_current_date()
        yield item
def parse_article_pv_items(self, response):
    """Read the page-view counter for one article from a JSON-with-junk body.

    The regex carves out a span of the form ``{...}]...}`` — the object
    wrapping the "result" array — before json.loads; only the first result
    element is used (one article per response). Then the next article id in
    self.article_list is requested with the item deep-copied into meta.
    """
    item = response.meta['item']
    # NOTE(review): pattern assumes exactly one object inside the result
    # array; a multi-element result would break the match — confirm upstream.
    content = re.search(r"{[^}]+}][^}]+}", response.body.decode()).group()
    content = json.loads(content, strict=False)
    result = content["result"][0]
    item["article_id"] = result["id"]
    item["pv_count"] = result["pvcount"]
    item["update_time"] = get_current_date()
    yield item
    self.article_index += 1
    if len(self.article_list) > self.article_index:
        url = self.base_url % (self.article_list[self.article_index][0])
        yield Request(url=url, callback=self.parse_article_pv_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse(self, response):
    """Record the activity-friend count for the current club id, then request
    the next club in the list."""
    item = ClubActivityFriendsItem()
    summary = json.loads(response.body.decode(), strict=False)["result"]
    item["bbs_id"] = self.club_id_list[self.club_index]
    item["activity_friend_count"] = summary["activityfriendcount"]
    item["time"] = get_current_date()
    yield item
    self.club_index += 1
    if self.club_index < len(self.club_id_list):
        next_url = self.base_url % self.club_id_list[self.club_index]
        yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
def parse_club_circle_items(self, response):
    """Record the circle row count for the current club id, then request the
    next club with the item deep-copied into meta."""
    item = response.meta['item']
    summary = json.loads(response.body.decode(), strict=False)["result"]
    item["bbs_id"] = self.club_id_list[self.club_index]
    item["row_count"] = summary["rowcount"]
    item["time"] = get_current_date()
    yield item
    self.club_index += 1
    if self.club_index < len(self.club_id_list):
        next_url = self.base_url % self.club_id_list[self.club_index]
        yield Request(url=next_url, callback=self.parse_club_circle_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse(self, response):
    """Fill visit/helpful/comment counters for the current koubei id, then
    request the next id in self.koubei_list."""
    item = KoubeiReadItem()
    stats = json.loads(response.body.decode())["result"]
    item["koubei_id"] = stats["eid"]
    item["visit_count"] = stats["visitcount"]
    item["helpful_count"] = stats["helpfulcount"]
    item["comment_count"] = stats["commentcount"]
    item["time"] = get_current_date()
    yield item
    self.koubei_index += 1
    if self.koubei_index < len(self.koubei_list):
        next_url = self.base_url % (self.koubei_list[self.koubei_index])
        yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
def parse_dealer(self, response):
    """Fill dealer-profile fields from the JSON payload and return the item."""
    data = json.loads(response.body.decode())
    item = response.meta['item']
    # (item field, payload key) pairs — order preserved from the payload spec
    field_map = (
        ("company", "Company"),
        ("company_simple", "CompanySimple"),
        ("address", "Address"),
        ("pid", "PID"),
        ("cid", "CID"),
        ("sid", "SID"),
        ("business_area", "OrderRangeTitle"),
        ("lon", "MapLonBaidu"),
        ("lat", "MapLatBaidu"),
        ("kind_id", "KindID"),
        ("star_level", "StarLevel"),
    )
    for field, key in field_map:
        item[field] = data[key]
    item["update_time"] = get_current_date()
    return item
def parse(self, response):
    """Emit koubei ranking rows for every series at the current level, then
    follow the next level url after a short delay."""
    payload = json.loads(response.body.decode())["result"]
    item = KoubeiRankScrapyItem()
    for entry in payload["serieslist"]:
        item["level_id"] = payload["categoryid"]
        item["series_id"] = entry["seriesid"]
        item["koubei_rank"] = entry["rank"]
        item["koubei_score"] = entry["score"]
        item["koubei_evaluation_count"] = entry["evaluationcount"]
        item["koubei_update_time"] = get_current_date()
        yield item
    # throttle before hitting the next level endpoint
    time.sleep(1)
    self.url_index += 1
    if self.url_index < len(self.level_list):
        next_url = self.base_url % self.level_list[self.url_index]
        yield Request(url=next_url, callback=self.parse)
def parse_koubei_article_item(self, response):
    """Fill visit/helpful/comment counters for the current koubei id, then
    request the next id with the item deep-copied into meta."""
    item = response.meta['item']
    stats = json.loads(response.body.decode())["result"]
    item["koubei_id"] = stats["eid"]
    item["visit_count"] = stats["visitcount"]
    item["helpful_count"] = stats["helpfulcount"]
    item["comment_count"] = stats["commentcount"]
    item["time"] = get_current_date()
    yield item
    self.koubei_index += 1
    if self.koubei_index < len(self.koubei_list):
        next_url = self.base_url % (self.koubei_list[self.koubei_index])
        yield Request(url=next_url, callback=self.parse_koubei_article_item,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
def parse(self, response):
    """Yield (SummaryKey, Volume) tag counts for the current series'
    structured summary, then request the next series."""
    item = KoubeiTagNumItem()
    result = json.loads(response.body.decode())["result"]
    structured = result["structuredlist"]
    if len(structured) > 0:
        for summary in structured[0]["Summary"]:
            # key 0 is skipped (presumably a placeholder/aggregate entry — confirm)
            if summary["SummaryKey"] != 0:
                item["summary_key"] = summary["SummaryKey"]
                item["volume"] = summary["Volume"]
                item["time"] = get_current_date()
                yield item
    # advance to the next series
    self.series_index += 1
    if self.series_index < len(self.series_list):
        next_url = self.base_url % self.series_list[self.series_index][0]
        yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Yield one scored "feeling" entry per *Scene key of a koubei article,
    then request the next koubei id."""
    item = KoubeiArticleItem()
    result = json.loads(response.body.decode())["result"]
    item["koubei_id"] = result["eid"]
    for key in result:
        # only keys ending in "...Scene" carry a feeling/score sub-object
        if re.search(r"[a-zA-Z]+Scene", key):
            scene = result[key]
            item["feeling_name"] = scene["feelingname"]
            item["feeling"] = scene["feeling"]
            item["score"] = scene["score"]
            item["time"] = get_current_date()
            yield item
    # advance to the next koubei article
    self.koubei_index += 1
    if self.koubei_index < len(self.koubei_list):
        next_url = self.base_url % (self.koubei_list[self.koubei_index])
        yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
def parse_tag_item(self, response):
    """Yield (SummaryKey, Volume) tag counts for the current series, then
    request the next series with the item deep-copied into meta."""
    item = response.meta["item"]
    result = json.loads(response.body.decode())["result"]
    structured = result["structuredlist"]
    if len(structured) > 0:
        for summary in structured[0]["Summary"]:
            # key 0 is skipped (presumably a placeholder/aggregate entry — confirm)
            if summary["SummaryKey"] != 0:
                item["summary_key"] = summary["SummaryKey"]
                item["volume"] = summary["Volume"]
                item["time"] = get_current_date()
                yield item
    # advance to the next series
    self.series_index += 1
    if self.series_index < len(self.series_list):
        next_url = self.base_url % self.series_list[self.series_index][0]
        yield Request(url=next_url, callback=self.parse_tag_item,
                      meta={'item': copy.deepcopy(item)}, dont_filter=True)