コード例 #1
0
class AutoHomeTopicInfoSpider(CrawlSpider):
    """Crawls per-club topic statistics (row/friend counts), one club per request."""

    name = "club_topic_info"
    # Club ids come from the DB helper; a short hard-coded list such as
    # [6, 13, 15, 16] can be substituted here when debugging.
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    base_url = "https://clubnc.app.autohome.com.cn/club_v8.2.0/club/topics-pm2-b%s-btc-r0-ss0-o0-p2-s50-qf0-c110100-t0-v8.8.0.json"
    start_urls = [base_url % club_id_list[club_index]]

    def parse(self, response):
        # Re-issue the start URL so the JSON callback receives a fresh item;
        # dont_filter bypasses the dupe filter for the already-seen URL.
        yield Request(url=response.url,
                      callback=self.parse_club_topic_info_items,
                      meta={"item": copy.deepcopy(ClubTopicInfoScrapyItem())},
                      dont_filter=True)

    def parse_club_topic_info_items(self, response):
        """Extract one stats record, then chain a request for the next club."""
        item = response.meta['item']
        payload = json.loads(response.body.decode(), strict=False)["result"]
        item["bbs_id"] = payload["clubid"]
        item["row_count"] = payload["rowcount"]
        item["friend_count"] = payload["friendcount"]
        item["time"] = get_current_date()
        yield item
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            next_url = self.base_url % self.club_id_list[self.club_index]
            yield Request(url=next_url,
                          callback=self.parse_club_topic_info_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
コード例 #2
0
 def parse_club_topic_read_items(self, response):
     """Parse one read-count record, then schedule the next topic request.

     The endpoint returns JSON wrapped in extra text, so the first
     brace-delimited object is extracted with a regex before parsing.
     """
     item = response.meta['item']
     raw = response.body.decode()
     data = json.loads(re.search(r"{[^}]+}", raw).group(), strict=False)
     item["topic_id"] = data["TopicId"]
     item["reply"] = data["Replys"]
     item["view"] = data["Views"]
     item["time"] = get_current_date()
     yield item
     self.topic_index += 1
     if self.topic_index < len(self.topic_id_list):
         next_url = self.base_url % self.topic_id_list[self.topic_index]
     else:
         # Current batch exhausted: pull the next window of up to 1000 ids.
         self.offset += 1000
         self.topic_id_list = StructureStartUrl().get_topic_id(self.offset)
         if not (0 < len(self.topic_id_list) <= 1000):
             return
         self.topic_index = 0
         print(self.topic_id_list[self.topic_index])
         next_url = self.base_url % self.topic_id_list[self.topic_index]
     yield Request(url=next_url,
                   callback=self.parse_club_topic_read_items,
                   meta={"item": copy.deepcopy(item)},
                   dont_filter=True)
コード例 #3
0
class AutoHomeCircleSpider(CrawlSpider):
    """Crawls the car-friend-circle row count for every club, one club per request."""

    name = "club_circle"
    # FIX: the hard-coded debug list was a live assignment immediately
    # shadowed by the DB lookup below; keep it as a comment only, matching
    # the sibling spiders in this file.
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getcarfriendcirclelist-pm2-utd5a4a902aa6c4db1b5ca0adb2df71dda03f29a2a-b%s-p1-s20.json"
    start_urls = [base_url % club_id_list[club_index]]

    def parse(self, response):
        """Bootstrap: re-request the start URL carrying a fresh item in meta."""
        item = ClubCircleScrapyItem()
        yield Request(url=response.url,
                      callback=self.parse_club_circle_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_circle_items(self, response):
        """Record the circle row count for the current club, then chain to the next."""
        item = response.meta['item']
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["row_count"] = result["rowcount"]
        item["time"] = get_current_date()
        yield item
        self.club_index += 1
        if self.club_index < len(self.club_id_list):
            url = self.base_url % self.club_id_list[self.club_index]
            yield Request(url=url,
                          callback=self.parse_club_circle_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
コード例 #4
0
class AutoHomeClubActivityFriendDetailSpider(CrawlSpider):
    """Crawls the recommended and active car-friend lists for every club,
    paging through each club before advancing to the next one."""

    name = "club_activity_friend_detail"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    # club_id_list = [834]
    club_index = 0
    page_index = 1
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getactivityfriendlist-pm2-b%s-t2-c0-u66230826-p%s-s20.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        """Bootstrap: re-request the start URL with a fresh item in meta.

        BUG FIX: added dont_filter=True. This URL was just fetched as a start
        URL, so Scrapy's default duplicate filter would silently drop the
        request and the spider would produce no items at all.
        """
        item = ClubActivityFriendDetailScrapyItem()
        yield Request(url=response.url,
                      callback=self.parse_club_activity_friend_detail_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_activity_friend_detail_items(self, response):
        """Yield one record per friend, then page within the club or advance clubs."""
        item = response.meta['item']
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        activity_friend_list = result["activityfriendlist"]  # active car friends
        club_master_list = result["clubmasterlist"]  # recommended car friends
        # FIX: yield a fresh copy per record — re-yielding one mutated item
        # object lets later assignments clobber records still queued in the
        # pipeline.
        for club_master in club_master_list:
            record = copy.deepcopy(item)
            record["bbs_id"] = self.club_id_list[self.club_index]
            record["user_id"] = club_master["userid"]
            record["recommend"] = 0
            record["time"] = get_current_date()
            yield record
        for activity_friend in activity_friend_list:
            record = copy.deepcopy(item)
            record["bbs_id"] = self.club_id_list[self.club_index]
            record["user_id"] = activity_friend["userid"]
            record["recommend"] = 1
            record["time"] = get_current_date()
            yield record

        self.page_index += 1
        if self.page_index <= result["pagecount"]:
            url = self.base_url % (self.club_id_list[self.club_index],
                                   self.page_index)
            yield Request(
                url=url,
                callback=self.parse_club_activity_friend_detail_items,
                meta={"item": copy.deepcopy(item)},
                dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index],
                                       self.page_index)
                yield Request(
                    url=url,
                    callback=self.parse_club_activity_friend_detail_items,
                    meta={"item": copy.deepcopy(item)},
                    dont_filter=True)
コード例 #5
0
class AutoHomeTopicListSpider(CrawlSpider):
    """Crawls the full topic list of every club, page by page."""

    name = "club_topic_list"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    page_index = 1
    base_url = "https://clubnc.app.autohome.com.cn/club_v8.2.0/club/topics-pm2-b%s-btc-r0-ss0-o0-p%s-s50-qf0-c110100-t0-v8.8.0.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        """Bootstrap: re-request the start URL with a fresh item in meta.

        BUG FIX: added dont_filter=True. The start URL was already fetched,
        so without it Scrapy's default duplicate filter drops this request
        and the spider yields nothing.
        """
        item = ClubTopicListScrapyItem()
        yield Request(url=response.url,
                      callback=self.parse_club_topic_list_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_club_topic_list_items(self, response):
        """Yield one item per topic, then page within the club or advance clubs."""
        item = response.meta['item']
        content = json.loads(response.body.decode(), strict=False)
        topic_list = content["result"]["list"]
        for topic in topic_list:
            # FIX: fresh copy per record so later mutations cannot clobber
            # items still queued in the pipeline.
            record = copy.deepcopy(item)
            record["topic_id"] = topic["topicid"]
            record["bbs_id"] = topic["bbsid"]
            record["title"] = topic["title"]
            record["user_id"] = topic["userid"]
            record["reply_counts"] = topic["replycounts"]
            record["post_topic_date"] = topic["posttopicdate"]
            record["last_reply_date"] = topic["lastreplydate"]
            record["topic_type"] = topic["topictype"]
            record["time"] = get_current_date()
            yield record
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            url = self.base_url % (self.club_id_list[self.club_index],
                                   self.page_index)
            yield Request(url=url,
                          callback=self.parse_club_topic_list_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index],
                                       self.page_index)
                yield Request(url=url,
                              callback=self.parse_club_topic_list_items,
                              meta={"item": copy.deepcopy(item)},
                              dont_filter=True)
コード例 #6
0
    def parse_club_topic_comment_items(self, response):
        """Parse one page of topic comments, then page forward or move on.

        Two parallel views of the page are walked together: the xpath node
        list carries per-comment metadata, while the BeautifulSoup node list
        carries the comment bodies. `i` only advances when a comment is
        actually yielded, keeping the two lists aligned past skipped entries.
        """
        i = 0
        item = response.meta['item']
        # NOTE(review): no parser argument — BeautifulSoup picks a default and
        # warns; html.parser is presumably intended, confirm before changing.
        soup = BeautifulSoup(response.body.decode())
        node_list = response.xpath("//*[@class=\"post-flow\"]/li")
        soup_node_list = soup.find_all('div', {'class': 'user-content'})
        for node in node_list:
            # NOTE(review): despite the field name, club_id_list here holds
            # topic ids (see the batch refill below, which calls get_topic_id).
            item["topic_id"] = self.club_id_list[self.club_index]
            # Each field falls back to "" when its node is missing.
            try:
                item["user"] = node.xpath(".//span[@class=\"name\"]/a/text()").extract()[0]
            except Exception as e:
                item["user"] = ""
            try:
                item["content"] = soup_node_list[i].get_text().strip("\n")
            except Exception as e:
                item["content"] = ""
            try:
                item["publish_time"] = node.xpath('.//*[@class="time"]/text()').extract()[0]
            except Exception as e:
                item["publish_time"] = ""
            try:
                item["id"] = node.xpath('.//span[@class="flowLevel"]/span/text()').extract()[0]
            except Exception as e:
                item["id"] = ""
            # Only entries with a floor number (flowLevel) are real comments.
            if item["id"] != "":
                yield item
                i += 1

        self.page_index += 1
        # A page holding 1..20 comment bodies means more pages may follow
        # (20 is the page size in base_url).
        if 0 < len(soup_node_list) <= 20:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield Request(url=url, callback=self.parse_club_topic_comment_items, meta={"item": copy.deepcopy(item)})
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield Request(url=url, callback=self.parse_club_topic_comment_items, meta={"item": copy.deepcopy(item)})
            else:
                # Current id batch exhausted: fetch the next window of up to
                # 1000 topic ids and restart from its first entry.
                self.offset += 1000
                self.club_id_list = StructureStartUrl().get_topic_id(self.offset)
                if 1000 >= len(self.club_id_list) > 0:
                    self.club_index = 0
                    self.page_index = 1
                    url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                    yield Request(url=url, callback=self.parse_club_topic_comment_items,
                                  meta={"item": copy.deepcopy(item)})
コード例 #7
0
class AutoHomeCircleDetailSpider(CrawlSpider):
    """Crawls detailed car-friend-circle records for every club, page by page."""

    name = "club_circle_detail"
    # club_id_list = [6, 13, 15, 16]
    club_id_list = StructureStartUrl().get_bbs_id()
    # club_id_list = [834]
    club_index = 0
    page_index = 1
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getcarfriendcirclelist-pm2-utd5a4a902aa6c4db1b5ca0adb2df71dda03f29a2a-b%s-p%s-s20.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    # Item field -> key in each JSON "list" entry, in write order.
    _FIELDS = (
        ("bbs_id", "bbsid"),
        ("circle_id", "circleid"),
        ("circle_name", "circlename"),
        ("user_count", "usercount"),
        ("province_id", "provinceid"),
        ("city_id", "cityid"),
        ("explain", "explain"),
        ("activen_num", "activennum"),
        ("create_time", "createtime"),
        ("last_update_time", "lastupdatetime"),
        ("owner_id", "ownerid"),
    )

    def parse(self, response):
        """Re-issue the start URL so the detail callback receives a fresh item."""
        yield Request(url=response.url, callback=self.parse_club_circle_detail_items,
                      meta={"item": copy.deepcopy(ClubCircleDetailScrapyItem())}, dont_filter=True)

    def parse_club_circle_detail_items(self, response):
        """Yield one record per circle, then page on or move to the next club."""
        item = response.meta['item']
        payload = json.loads(response.body.decode(), strict=False)
        for entry in payload["result"]["list"]:
            for dest, src in self._FIELDS:
                item[dest] = entry[src]
            item["time"] = get_current_date()
            yield item

        self.page_index += 1
        if self.page_index <= payload["result"]["pagecount"]:
            print(self.page_index)
            next_url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            print(next_url)
        else:
            self.club_index += 1
            if self.club_index >= len(self.club_id_list):
                return
            self.page_index = 1
            next_url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
        yield Request(url=next_url, callback=self.parse_club_circle_detail_items,
                      meta={"item": copy.deepcopy(item)}, dont_filter=True)
コード例 #8
0
class AutoHomeTopicReadSpider(CrawlSpider):
    """Read counts of topics: fetches views/replies one topic at a time,
    refilling the id list in batches of up to 1000."""

    name = "club_topic_read"
    # topic_id_list = [1570, 2715, 3246]
    offset = 0
    topic_id_list = StructureStartUrl().get_topic_id(offset)
    topic_index = 0
    base_url = "https://forum.app.autohome.com.cn/forum_v7.9.5/forum/club/topicaddclicksajax?topicid=%s"
    start_urls = [base_url % topic_id_list[topic_index]]

    def parse(self, response):
        # Re-issue the start URL carrying a fresh item in meta.
        yield Request(url=response.url,
                      callback=self.parse_club_topic_read_items,
                      meta={"item": copy.deepcopy(ClubTopicReadScrapyItem())},
                      dont_filter=True)

    def parse_club_topic_read_items(self, response):
        """Parse one read-count record, then schedule the next topic request.

        The endpoint wraps its JSON in extra text, so the first
        brace-delimited object is cut out with a regex before parsing.
        """
        item = response.meta['item']
        raw = response.body.decode()
        data = json.loads(re.search(r"{[^}]+}", raw).group(), strict=False)
        item["topic_id"] = data["TopicId"]
        item["reply"] = data["Replys"]
        item["view"] = data["Views"]
        item["time"] = get_current_date()
        yield item
        self.topic_index += 1
        if self.topic_index < len(self.topic_id_list):
            next_url = self.base_url % self.topic_id_list[self.topic_index]
        else:
            # Current batch exhausted: pull the next window of topic ids.
            self.offset += 1000
            self.topic_id_list = StructureStartUrl().get_topic_id(self.offset)
            if not (0 < len(self.topic_id_list) <= 1000):
                return
            self.topic_index = 0
            print(self.topic_id_list[self.topic_index])
            next_url = self.base_url % self.topic_id_list[self.topic_index]
        yield Request(url=next_url,
                      callback=self.parse_club_topic_read_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)
コード例 #9
0
class AutoHomeClubTopicListSpider(scrapy.Spider):
    """Crawls every club's topic list, normalising relative timestamps."""

    name = 'auto_home_club_topic_list'
    club_id_list = StructureStartUrl().get_bbs_id()
    # club_id_list = [4744]
    club_index = 0
    page_index = 1
    base_url = "https://clubnc.app.autohome.com.cn/club_v8.2.0/club/topics-pm2-b%s-btc-r0-ss0-o0-p%s-s50-qf0-c110100-t0-v8.8.0.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        """Yield one item per topic, then page within the club or advance clubs."""
        item = ClubTopicListItem()
        content = json.loads(response.body.decode(), strict=False)
        topic_list = content["result"]["list"]
        for topic in topic_list:
            item["topic_id"] = topic["topicid"]
            item["bbs_id"] = topic["bbsid"]
            item["title"] = topic["title"]
            item["user_id"] = topic["userid"]
            item["reply_counts"] = topic["replycounts"]
            # Dates containing 前 are relative ("3小时前" = "3 hours ago") and
            # get converted to absolute timestamps; others pass through.
            post_topic_date = topic["posttopicdate"]
            if "前" in post_topic_date:
                item["post_topic_date"] = formmat_time(post_topic_date)
            else:
                item["post_topic_date"] = post_topic_date
            last_reply_date = topic["lastreplydate"]
            # BUG FIX: this check previously inspected post_topic_date, so
            # relative last-reply dates were never normalised (and absolute
            # ones could be wrongly reformatted).
            if "前" in last_reply_date:
                item["last_reply_date"] = formmat_time(last_reply_date)
            else:
                item["last_reply_date"] = last_reply_date
            item["topic_type"] = topic["topictype"]
            item["time"] = get_current_date()
            yield item
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
        else:
            self.club_index += 1
            # Console progress line: percentage and (done/total) club count.
            percent = self.club_index / len(self.club_id_list)
            sys.stdout.write("\r" + "抓取进度:%d%%(%d/%d)" % (percent * 100, self.club_index, len(self.club_id_list)))
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
コード例 #10
0
class AutoHomeClubActivityFriendsSpider(scrapy.Spider):
    """Yields one record per club: its count of active car friends."""

    name = 'auto_home_club_activity_friends'
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getactivityfriendlist-pm2-b%s-t2-c0-u66230826-p1-s20.json"
    start_urls = [base_url % club_id_list[club_index]]

    def parse(self, response):
        """Record the active-friend count, then chain a request for the next club."""
        item = ClubActivityFriendsItem()
        payload = json.loads(response.body.decode(), strict=False)["result"]
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["activity_friend_count"] = payload["activityfriendcount"]
        item["time"] = get_current_date()
        yield item
        self.club_index += 1
        if self.club_index >= len(self.club_id_list):
            return
        next_url = self.base_url % self.club_id_list[self.club_index]
        yield scrapy.Request(url=next_url,
                             callback=self.parse,
                             dont_filter=True)
コード例 #11
0
class AutoHomeClubCircleSpider(scrapy.Spider):
    """Crawls the car-friend-circle listing for every club id, page by page."""

    name = 'auto_home_club_circle'
    club_id_list = StructureStartUrl().get_bbs_id()
    club_index = 0
    page_index = 1
    # Old endpoint, kept for reference:
    # base_url = "https://club.app.autohome.com.cn/club_v8.2.0/club/getcarfriendcirclelist-pm2-utd5a4a902aa6c4db1b5ca0adb2df71dda03f29a2a-b%s-p1-s20.json"
    base_url = "https://chat.api.autohome.com.cn/c1/s1/api/getSeriesProvinceTagCyqList?cyqType=1&convertId=%s&memberId=0&pageIndex=%s&pageSize=15&_appid=club.pc"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        """Yield one item per circle, then page within the club or advance clubs."""
        item = ClubCircleItem()
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        item["bbs_id"] = self.club_id_list[self.club_index]
        item["row_count"] = result["rowCount"]
        details = result["list"]
        page_count = result["pageCount"]
        for detail in details:
            item["targetId"] = detail["targetId"]  # circle id
            item["seriesId"] = detail["seriesId"]  # car series id
            item["score"] = detail["score"]  # popularity
            item["title"] = detail["title"]  # circle name
            item["explain"] = detail["explain"]  # description
            item["memberCount"] = detail["memberCount"]  # member count
            item["time"] = get_current_date()
            yield item
        self.page_index += 1
        # BUG FIX: was `<`, which skipped the final page of every club;
        # all sibling spiders in this file use `<=` against pagecount.
        if self.page_index <= page_count:
            url = self.base_url % (self.club_id_list[self.club_index],
                                   self.page_index)
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index],
                                       self.page_index)
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     dont_filter=True)
コード例 #12
0
 def __init__(self):
     """Prepare the topic-content crawl: reset the cursor, set the URL
     template, and load the topic ids to visit."""
     # Crawl cursor — presumably the position within self.topic_list;
     # confirm against the callbacks that consume it.
     self.index = 1
     self.base_url = "https://forum.app.autohome.com.cn/forum_v7.9.5/forum/club/topiccontent-a2-pm2-v8.8.0-t%s-o0-p1-s20-c1-nt0-fs0-sp0-al0-cw360-i0-ct1.json"
     # Topic ids supplied by the DB helper (no offset argument here,
     # unlike the batched spiders elsewhere in this file).
     self.topic_list = StructureStartUrl().get_topic_id()
コード例 #13
0
class AutoHomeClubTopicCommentsSpider(scrapy.Spider):
    """Crawls the comments of each topic in club_id_list, page by page.

    NOTE(review): despite the name, club_id_list is filled by get_topic_id(),
    so it holds topic ids — the %s in base_url is a topic id (t%s).
    """
    name = 'auto_home_club_topic_comments'
    club_index = 0
    page_index = 1

    club_id_list = StructureStartUrl().get_topic_id()
    base_url = "https://forum.app.autohome.com.cn/forum_v7.9.5/forum/club/topiccontent-a2-pm2-v8.8.0-t%s-o0-p%s-s20-c1-nt0-fs0-sp0-al0-cw360-i0-ct1.json"
    start_urls = [base_url % (club_id_list[club_index], page_index)]

    def parse(self, response):
        """Parse one page of comments, then page forward or move to the next topic.

        Walks two parallel views of the page: the xpath node list for
        per-comment metadata and the BeautifulSoup node list for comment
        bodies. `i` only advances when a comment is yielded, keeping the
        two lists aligned past skipped entries.
        """
        item = ClubTopicCommentsItem()
        i = 0
        # NOTE(review): no parser argument — BeautifulSoup picks a default
        # and warns; html.parser is presumably intended, confirm first.
        soup = BeautifulSoup(response.body.decode('utf-8'))
        node_list = response.xpath("//*[@class=\"post-flow\"]/li")

        soup_node_list = soup.find_all('div', {'class': 'user-content'})
        for node in node_list:
            # Per-page substitution rules used to reconstruct obfuscated text
            # (semantics of get_rule are defined elsewhere in the project).
            rule = get_rule(response.text)
            item["topic_id"] = self.club_id_list[self.club_index]
            # Each field falls back to a default when its node is missing.
            try:
                item["user"] = node.xpath(
                    ".//span[@class=\"name\"]/a/text()").extract()[0]
            except Exception as e:
                item["user"] = ""
            try:
                item["user_id"] = node.xpath(
                    './/span[@class="name"]/a/@href').get().split("㊣")[1]
            except Exception as e:
                item["user_id"] = ""
            try:
                # Splice the rule-mapped characters into obfuscated <span>s
                # before extracting the comment text.
                span_list = soup_node_list[i].find_all('span')
                for span in span_list:
                    span.append(rule[span["class"][0]])
                item["content"] = soup_node_list[i].get_text().strip("\n")
            except Exception as e:
                item["content"] = "该帖已删除"
            try:
                item["publish_time"] = node.xpath(
                    './/*[@class="time"]/text()').extract()[0]
            except Exception as e:
                item["publish_time"] = ""
            try:
                item["id"] = node.xpath(
                    './/span[@class="flowLevel"]/span/text()').extract()[0]
            except Exception as e:
                item["id"] = ""
            # Only entries with a floor number (flowLevel) are real comments.
            if item["id"] != "":
                yield item
                i += 1
        self.page_index += 1
        # 1..20 comment bodies on this page (20 = page size in base_url)
        # means more pages may follow for the same topic.
        if 0 < len(soup_node_list) <= 20:
            url = self.base_url % (self.club_id_list[self.club_index],
                                   self.page_index)
            yield scrapy.Request(url=url, callback=self.parse)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index],
                                       self.page_index)
                yield scrapy.Request(url=url, callback=self.parse)