def parse(self, response):
        """Parse one page of a club's friend lists.

        Emits one item per entry of "clubmasterlist" (recommend flag 0) and
        one per entry of "activityfriendlist" (recommend flag 1), then
        requests the next page of the same club, or page 1 of the next club
        once pages are exhausted.
        """
        content = json.loads(response.body.decode(), strict=False)
        activity_friend_list = content["result"]["activityfriendlist"]  # active club friends
        club_master_list = content["result"]["clubmasterlist"]  # recommended club friends
        # BUG FIX: previously one Item instance was mutated and yielded for
        # every record; any pipeline/export that defers processing would see
        # only the last mutation. Build a fresh item per record instead.
        for club_master in club_master_list:
            item = ClubActivityFriendsDetailsItem()
            item["bbs_id"] = self.club_id_list[self.club_index]
            item["user_id"] = club_master["userid"]
            item["recommend"] = 0  # 0 = came from clubmasterlist
            item["time"] = get_current_date()
            yield item
        for activity_friend in activity_friend_list:
            item = ClubActivityFriendsDetailsItem()
            item["bbs_id"] = self.club_id_list[self.club_index]
            item["user_id"] = activity_friend["userid"]
            item["recommend"] = 1  # 1 = came from activityfriendlist
            item["time"] = get_current_date()
            yield item

        # Paginate within the current club; when done, advance to next club.
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            print(self.page_index)
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            print(url)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
    def parse_club_circle_detail_items(self, response):
        """Parse one page of a club's circle list; yield one item per circle,
        then request the next page, or page 1 of the next club.
        """
        # NOTE(review): a single Item instance from meta is mutated and
        # yielded once per circle — assumes downstream consumes each yield
        # before the next mutation; confirm with the pipeline.
        item = response.meta['item']
        content = json.loads(response.body.decode(), strict=False)
        circle_list = content["result"]["list"]
        for circle in circle_list:
            item["bbs_id"] = circle["bbsid"]
            item["circle_id"] = circle["circleid"]
            item["circle_name"] = circle["circlename"]
            item["user_count"] = circle["usercount"]
            item["province_id"] = circle["provinceid"]
            item["city_id"] = circle["cityid"]
            item["explain"] = circle["explain"]
            item["activen_num"] = circle["activennum"]
            item["create_time"] = circle["createtime"]
            item["last_update_time"] = circle["lastupdatetime"]
            item["owner_id"] = circle["ownerid"]
            item["time"] = get_current_date()  # crawl timestamp
            yield item

        # Paginate within the current club; when pages run out, move to the
        # next club id and restart at page 1.
        self.page_index += 1
        if self.page_index <= content["result"]["pagecount"]:
            print(self.page_index)
            url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
            print(url)
            yield Request(url=url, callback=self.parse_club_circle_detail_items, meta={"item": copy.deepcopy(item)}, dont_filter=True)
        else:
            self.club_index += 1
            if self.club_index < len(self.club_id_list):
                self.page_index = 1
                url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
                yield Request(url=url, callback=self.parse_club_circle_detail_items, meta={"item": copy.deepcopy(item)}, dont_filter=True)
# --- Example 3 ---
 def parse(self, response):
     """Parse one page of the club-circle ranking; yield one item per entry,
     then paginate within the club or advance to the next club id.
     """
     item = ClubCircleItem()
     content = json.loads(response.body.decode(), strict=False)
     result = content["result"]
     item["bbs_id"] = self.club_id_list[self.club_index]
     item["row_count"] = result["rowCount"]
     details = result["list"]
     pageCount = result["pageCount"]
     for detail in details:
         item["targetId"] = detail["targetId"]  # circle id
         item["seriesId"] = detail["seriesId"]  # car-series id
         item["score"] = detail["score"]  # popularity score
         item["title"] = detail["title"]  # circle name
         item["explain"] = detail["explain"]  # description
         item["memberCount"] = detail["memberCount"]  # member count
         item["time"] = get_current_date()
         yield item
     self.page_index += 1
     # NOTE(review): "<" is used here while sibling parsers use "<=" against
     # the page count — confirm whether the last page is intentionally skipped.
     if self.page_index < pageCount:
         url = self.base_url % (self.club_id_list[self.club_index],
                                self.page_index)
         yield scrapy.Request(url, callback=self.parse)
     else:
         self.club_index += 1
         if self.club_index < len(self.club_id_list):
             self.page_index = 1
             url = self.base_url % (self.club_id_list[self.club_index],
                                    self.page_index)
             yield scrapy.Request(url=url,
                                  callback=self.parse,
                                  dont_filter=True)
# --- Example 4 ---
 def parse_article_comment_items(self, response):
     """Parse one page of article comments; yield one item per comment and
     page through using the last comment id as the cursor.
     """
     item = response.meta['item']
     content = json.loads(response.body.decode(), strict=False)
     comment_list = content["result"]["list"]
     for comment in comment_list:
         item["article_id"] = self.article_list[self.article_index][0]
         item["comment_id"] = comment["id"]
         item["floor"] = comment["floor"]
         item["user_id"] = comment["nameid"]
         item["publish_time"] = comment["time"]
         item["content"] = comment["content"]
         item["update_time"] = get_current_date()
         yield item
     # Cursor for the next page: the id of the last comment seen.
     # NOTE(review): if comment_list is empty, item["comment_id"] may be
     # missing or stale here — confirm the API never returns an empty page.
     self.last_time = item["comment_id"]
     self.page_index += 1
     # Pages hold 20 comments; keep paging while whole pages remain.
     if math.ceil(content["result"]["totalcount"] / 20) >= self.page_index:
         url = self.base_url % (self.article_list[self.article_index][0],
                                self.last_time)
         yield Request(url=url,
                       callback=self.parse_article_comment_items,
                       meta={"item": copy.deepcopy(item)},
                       dont_filter=True)
     else:
         # Current article exhausted: move to the next article id.
         self.article_index += 1
         if self.article_index < len(self.article_list):
             self.page_index = 1
             self.last_time = 0
             url = self.base_url % (
                 self.article_list[self.article_index][0], self.last_time)
             print(url)
             yield Request(url=url,
                           callback=self.parse_article_comment_items,
                           meta={"item": copy.deepcopy(item)},
                           dont_filter=True)
# --- Example 5 ---
    def parse_article_list_items(self, response):
        """Parse one page of the article feed; yield one item per article and
        page on via the "pageid" cursor until a short page ends the type.
        """
        item = response.meta['item']
        content = json.loads(response.body.decode(), strict=False)
        news_list = content["result"]["newslist"]
        # "pageid" is the server-side cursor for the next request.
        item["last_time"] = content["result"]["pageid"]
        for news in news_list:
            news = news["data"]
            item["id"] = news["id"]
            item["title"] = news["title"]
            item["media_type"] = news["mediatype"]
            item["type"] = news["type"]
            item["publish_time"] = news["updatetime"]
            item["author"] = news["thirdsource"]
            try:
                item["ver"] = news["ver"]
            except Exception as e:
                # "ver" is optional in the payload; default to empty string.
                item["ver"] = ""
            item["update_time"] = get_current_date()

            yield item
        self.last_time = item["last_time"]
        # A full page holds 30 entries; fewer means this article type is done.
        if len(news_list) == 30:
            url = self.base_url % (self.last_time, self.type_list[self.type_index])
            yield Request(url=url, callback=self.parse_article_list_items, meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
        else:
            # Move to the next article type and reset the cursor.
            self.type_index += 1
            if self.type_index < len(self.type_list):
                self.last_time = 0
                url = self.base_url % (self.last_time, self.type_list[self.type_index])
                yield Request(url=url, callback=self.parse_article_list_items, meta={"item": copy.deepcopy(item)},
                              dont_filter=True)
# --- Example 6 ---
 def parse(self, response):
     """Parse one page of koubei (word-of-mouth) comments; yield one item
     per comment and page on by the last comment id.
     """
     item = KoubeiCommentItem()
     content = json.loads(response.body.decode())
     result = content["result"]
     item["koubei_id"] = self.comment_list[self.comment_index]
     if len(result["list"]) > 0:
         for comment_list in result["list"]:
             item["id"] = comment_list["id"]
             item["user_id"] = comment_list["nameid"]
             item["content"] = comment_list["content"]
             item["carname"] = comment_list["carname"]
             create_time = comment_list["time"]
             # "前" marks a relative timestamp ("... ago"); normalize it.
             if "前" in create_time:
                 item["create_time"] = formmat_time(create_time)
             else:
                 item["create_time"] = comment_list["time"]
             item["time"] = get_current_date()
             self.comment_id_list.append(str(comment_list["id"]))
             yield item
     # Full page (20 entries) and the server cursor matches our last seen id
     # -> request the next page of the same koubei entry.
     if len(result["list"]) == 20 and (result["pageid"] == self.comment_id_list[-1]):
         url = self.base_url % (self.comment_list[self.comment_index], self.comment_id_list[-1])
         yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
     else:
         # Move to the next koubei entry and reset the id cursor.
         self.comment_index += 1
         if self.comment_index < len(self.comment_list):
             self.last_id = 0
             url = self.base_url % (self.comment_list[self.comment_index], self.last_id)
             yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
# --- Example 7 ---
 def parse_koubei_comment_item(self, response):
     """Parse one page of koubei comments (meta-item variant); yield one
     item per comment and page on by the last comment id.
     """
     item = response.meta['item']
     content = json.loads(response.body.decode())
     result = content["result"]
     item["koubei_id"] = self.comment_list[self.comment_index]
     if len(result["list"]) > 0:
         for comment_list in result["list"]:
             item["id"] = comment_list["id"]
             item["user_id"] = comment_list["nameid"]
             item["content"] = comment_list["content"]
             item["carname"] = comment_list["carname"]
             # NOTE(review): unlike the sibling parse(), relative "前" times
             # are stored raw here — confirm which behavior is intended.
             item["create_time"] = comment_list["time"]
             item["time"] = get_current_date()
             self.comment_id_list.append(str(comment_list["id"]))
             yield item
     # Full page (20) and server cursor equals our last id -> next page.
     if len(result["list"]) == 20 and (result["pageid"]
                                       == self.comment_id_list[-1]):
         url = self.base_url % (self.comment_list[self.comment_index],
                                self.comment_id_list[-1])
         yield Request(url=url,
                       callback=self.parse_koubei_comment_item,
                       meta={"item": copy.deepcopy(item)},
                       dont_filter=True)
     else:
         # Move to the next koubei entry and reset the id cursor.
         self.comment_index += 1
         if self.comment_index < len(self.comment_list):
             self.last_id = 0
             url = self.base_url % (self.comment_list[self.comment_index],
                                    self.last_id)
             yield Request(url=url,
                           callback=self.parse_koubei_comment_item,
                           meta={"item": copy.deepcopy(item)},
                           dont_filter=True)
# --- Example 8 ---
 def parse_club_topic_read_items(self, response):
     """Extract reply/view counters for one topic from a JSONP-like response,
     then walk the topic id list (refilled in batches of up to 1000 ids).
     """
     item = response.meta['item']
     content = response.body.decode()
     # The body is not plain JSON; grab the first {...} object out of it.
     content = re.search(r"{[^}]+}", content).group()
     content = json.loads(content, strict=False)
     item["topic_id"] = content["TopicId"]
     item["reply"] = content["Replys"]
     item["view"] = content["Views"]
     item["time"] = get_current_date()
     yield item
     self.topic_index += 1
     if self.topic_index < len(self.topic_id_list):
         url = self.base_url % self.topic_id_list[self.topic_index]
         yield Request(url=url,
                       callback=self.parse_club_topic_read_items,
                       meta={"item": copy.deepcopy(item)},
                       dont_filter=True)
     else:
         # Current batch exhausted: fetch the next batch of topic ids.
         self.offset += 1000
         self.topic_id_list = StructureStartUrl().get_topic_id(self.offset)
         if 1000 >= len(self.topic_id_list) > 0:
             self.topic_index = 0
             print(self.topic_id_list[self.topic_index])
             url = self.base_url % self.topic_id_list[self.topic_index]
             yield Request(url=url,
                           callback=self.parse_club_topic_read_items,
                           meta={"item": copy.deepcopy(item)},
                           dont_filter=True)
# --- Example 9 ---
 def parse(self, response):
     """Parse one page of a club's topic list; yield one item per topic,
     then paginate within the club or advance to the next club id.

     Relative timestamps containing "前" ("... ago") are normalized with
     formmat_time; absolute timestamps are stored as-is.
     """
     item = ClubTopicListItem()
     content = json.loads(response.body.decode(), strict=False)
     topic_list = content["result"]["list"]
     for topic in topic_list:
         item["topic_id"] = topic["topicid"]
         item["bbs_id"] = topic["bbsid"]
         item["title"] = topic["title"]
         item["user_id"] = topic["userid"]
         item["reply_counts"] = topic["replycounts"]
         post_topic_date = topic["posttopicdate"]
         if "前" in post_topic_date:
             item["post_topic_date"] = formmat_time(post_topic_date)
         else:
             item["post_topic_date"] = post_topic_date
         last_reply_date = topic["lastreplydate"]
         # BUG FIX: this branch previously tested post_topic_date (copy-paste
         # error), so relative last-reply times were emitted unnormalized.
         if "前" in last_reply_date:
             item["last_reply_date"] = formmat_time(last_reply_date)
         else:
             item["last_reply_date"] = last_reply_date
         item["topic_type"] = topic["topictype"]
         item["time"] = get_current_date()
         yield item
     self.page_index += 1
     if self.page_index <= content["result"]["pagecount"]:
         url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
         yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
     else:
         self.club_index += 1
         # Progress indicator over the club id list.
         percent = self.club_index / len(self.club_id_list)
         sys.stdout.write("\r" + "抓取进度:%d%%(%d/%d)" % (percent * 100, self.club_index, len(self.club_id_list)))
         if self.club_index < len(self.club_id_list):
             self.page_index = 1
             url = self.base_url % (self.club_id_list[self.club_index], self.page_index)
             yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
# --- Example 10 ---
 def parse_club_topic_list_items(self, response):
     """Parse one page of a club's topic list (meta-item variant); yield one
     item per topic, then paginate or advance to the next club id.
     """
     item = response.meta['item']
     content = json.loads(response.body.decode(), strict=False)
     topic_list = content["result"]["list"]
     for topic in topic_list:
         item["topic_id"] = topic["topicid"]
         item["bbs_id"] = topic["bbsid"]
         item["title"] = topic["title"]
         item["user_id"] = topic["userid"]
         item["reply_counts"] = topic["replycounts"]
         # NOTE(review): dates are stored raw here; the sibling parse()
         # normalizes relative "前" times — confirm which is intended.
         item["post_topic_date"] = topic["posttopicdate"]
         item["last_reply_date"] = topic["lastreplydate"]
         item["topic_type"] = topic["topictype"]
         item["time"] = get_current_date()
         yield item
     self.page_index += 1
     if self.page_index <= content["result"]["pagecount"]:
         url = self.base_url % (self.club_id_list[self.club_index],
                                self.page_index)
         yield Request(url=url,
                       callback=self.parse_club_topic_list_items,
                       meta={"item": copy.deepcopy(item)},
                       dont_filter=True)
     else:
         # Pages exhausted: restart at page 1 for the next club id.
         self.club_index += 1
         if self.club_index < len(self.club_id_list):
             self.page_index = 1
             url = self.base_url % (self.club_id_list[self.club_index],
                                    self.page_index)
             yield Request(url=url,
                           callback=self.parse_club_topic_list_items,
                           meta={"item": copy.deepcopy(item)},
                           dont_filter=True)
# --- Example 11 ---
 def parse_spec_item(self, response):
     """Yield one price record for every spec found in the JSON payload."""
     item = response.meta['item']
     payload = json.loads(response.body.decode())
     # Each top-level element groups its specs under "slist"; flatten them.
     specs = (spec for element in payload for spec in element["slist"])
     for spec in specs:
         item["spec_id"] = spec["SpecId"]
         item["price"] = spec["Price"]
         item["time"] = get_current_date()
         yield item
# --- Example 12 ---
 def parse_user_cars_item(self, response):
     """Yield one certification record per car the user follows."""
     item = response.meta["item"]
     data = json.loads(response.body.decode('gbk'))
     for concern in data["ConcernInfoList"]:
         item["spec_id"] = concern["SpecId"]
         # "/Date(1234567890)/" -> integer epoch -> local time string.
         raw_stamp = concern["datetime"]
         epoch = int(raw_stamp.lstrip("/Date(").rstrip("/)"))
         item["cert_date"] = timestamp_to_localtime(epoch)
         item["time"] = get_current_date()
         yield item
# --- Example 13 ---
 def parse_article_pv_items(self, response):
     """Extract the page-view count for one article, then move on to the
     next article id in the list.
     """
     item = response.meta['item']
     # The body is not plain JSON; carve out the {...}]...} fragment first.
     content = re.search(r"{[^}]+}][^}]+}", response.body.decode()).group()
     content = json.loads(content, strict=False)
     result = content["result"][0]
     item["article_id"] = result["id"]
     item["pv_count"] = result["pvcount"]
     item["update_time"] = get_current_date()
     yield item
     self.article_index += 1
     if len(self.article_list) > self.article_index:
         url = self.base_url % (self.article_list[self.article_index][0])
         yield Request(url=url, callback=self.parse_article_pv_items, meta={"item": copy.deepcopy(item)},
                       dont_filter=True)
 def parse(self, response):
     """Emit the active-friend count for the current club, then queue the
     next club id if any remain.
     """
     payload = json.loads(response.body.decode(), strict=False)["result"]
     item = ClubActivityFriendsItem()
     item["bbs_id"] = self.club_id_list[self.club_index]
     item["activity_friend_count"] = payload["activityfriendcount"]
     item["time"] = get_current_date()
     yield item
     # Advance through the club id list one club per response.
     self.club_index += 1
     if self.club_index < len(self.club_id_list):
         next_url = self.base_url % self.club_id_list[self.club_index]
         yield scrapy.Request(url=next_url,
                              callback=self.parse,
                              dont_filter=True)
# --- Example 15 ---
 def parse_club_circle_items(self, response):
     """Record the circle row count for the current club, then move on to
     the next club id if one remains.
     """
     item = response.meta['item']
     payload = json.loads(response.body.decode(), strict=False)
     item["bbs_id"] = self.club_id_list[self.club_index]
     item["row_count"] = payload["result"]["rowcount"]
     item["time"] = get_current_date()
     yield item
     # One club per response; advance the index and queue the next request.
     self.club_index += 1
     if self.club_index < len(self.club_id_list):
         next_url = self.base_url % self.club_id_list[self.club_index]
         yield Request(url=next_url,
                       callback=self.parse_club_circle_items,
                       meta={"item": copy.deepcopy(item)},
                       dont_filter=True)
# --- Example 16 ---
    def parse(self, response):
        """Emit visit/helpful/comment counters for one koubei article, then
        queue the next koubei id if any remain.
        """
        payload = json.loads(response.body.decode())
        result = payload["result"]
        item = KoubeiReadItem()
        item["koubei_id"] = result["eid"]
        item["visit_count"] = result["visitcount"]
        item["helpful_count"] = result["helpfulcount"]
        item["comment_count"] = result["commentcount"]
        item["time"] = get_current_date()
        yield item

        # One koubei entry per response; advance and queue the next one.
        self.koubei_index += 1
        if self.koubei_index < len(self.koubei_list):
            next_url = self.base_url % (self.koubei_list[self.koubei_index])
            yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
# --- Example 17 ---
 def parse_dealer(self, response):
     """Populate the dealer item from a JSON detail response and return it."""
     data = json.loads(response.body.decode())
     item = response.meta['item']
     # Map the camel-case API fields onto the item's snake_case keys.
     field_map = {
         "company": "Company",
         "company_simple": "CompanySimple",
         "address": "Address",
         "pid": "PID",
         "cid": "CID",
         "sid": "SID",
         "business_area": "OrderRangeTitle",
         "lon": "MapLonBaidu",
         "lat": "MapLatBaidu",
         "kind_id": "KindID",
         "star_level": "StarLevel",
     }
     for item_key, api_key in field_map.items():
         item[item_key] = data[api_key]
     item["update_time"] = get_current_date()
     return item
# --- Example 18 ---
 def parse(self, response):
     """Parse the koubei ranking page for one vehicle level; yield one item
     per car series, then move on to the next level in the list.
     """
     result = json.loads(response.body.decode())["result"]
     item = KoubeiRankScrapyItem()
     for series in result["serieslist"]:
         item["level_id"] = result["categoryid"]
         item["series_id"] = series["seriesid"]
         item["koubei_rank"] = series["rank"]
         item["koubei_score"] = series["score"]
         item["koubei_evaluation_count"] = series["evaluationcount"]
         item["koubei_update_time"] = get_current_date()
         yield item
         # Throttle: pause one second between emitted items.
         time.sleep(1)
     self.url_index += 1
     if self.url_index < len(self.level_list):
         url = self.base_url % self.level_list[self.url_index]
         yield Request(url=url, callback=self.parse)
# --- Example 19 ---
    def parse_koubei_article_item(self, response):
        """Extract visit/helpful/comment counters for one koubei article,
        then advance to the next koubei id in the list.
        """
        item = response.meta['item']
        content = json.loads(response.body.decode())
        item["koubei_id"] = content["result"]["eid"]
        node = content["result"]
        item["visit_count"] = node["visitcount"]
        item["helpful_count"] = node["helpfulcount"]
        item["comment_count"] = node["commentcount"]
        item["time"] = get_current_date()
        yield item

        # One koubei entry per response; advance and queue the next one.
        self.koubei_index += 1
        if self.koubei_index < len(self.koubei_list):
            url = self.base_url % (self.koubei_list[self.koubei_index])
            yield Request(url=url,
                          callback=self.parse_koubei_article_item,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
# --- Example 20 ---
    def parse(self, response):
        """Parse koubei tag summary counts for one car series; yield one item
        per non-zero summary tag, then advance to the next series.
        """
        item = KoubeiTagNumItem()
        content = json.loads(response.body.decode())
        result = content["result"]
        if len(result["structuredlist"]) > 0:
            structure = result["structuredlist"][0]
            for summary in structure["Summary"]:
                # SummaryKey 0 appears to be a placeholder; it is skipped.
                if summary["SummaryKey"] != 0:
                    item["summary_key"] = summary["SummaryKey"]
                    item["volume"] = summary["Volume"]
                    item["time"] = get_current_date()
                    yield item

        # Pagination: move on to the next car series.
        self.series_index += 1
        if self.series_index < len(self.series_list):
            url = self.base_url % self.series_list[self.series_index][0]
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
# --- Example 21 ---
    def parse(self, response):
        """Parse per-scene feeling scores from a koubei article; yield one
        item per "*Scene" node, then advance to the next koubei id.
        """
        item = KoubeiArticleItem()
        content = json.loads(response.body.decode())
        item["koubei_id"] = content["result"]["eid"]
        node_value = content["result"]
        for node in node_value:
            # Scene sections are keyed like "...Scene" in the result object.
            if re.search(r"[a-zA-Z]+Scene", node):
                item["feeling_name"] = node_value[node]["feelingname"]
                item["feeling"] = node_value[node]["feeling"]
                item["score"] = node_value[node]["score"]
                item["time"] = get_current_date()
                yield item

        # Pagination: move on to the next koubei id.
        self.koubei_index += 1
        if self.koubei_index < len(self.koubei_list):
            url = self.base_url % (self.koubei_list[self.koubei_index])
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
# --- Example 22 ---
    def parse_tag_item(self, response):
        """Parse koubei tag summary counts for one series (meta-item variant);
        yield one item per non-zero tag, then advance to the next series.
        """
        item = response.meta["item"]
        content = json.loads(response.body.decode())
        result = content["result"]
        if len(result["structuredlist"]) > 0:
            structure = result["structuredlist"][0]
            for summary in structure["Summary"]:
                # SummaryKey 0 appears to be a placeholder; it is skipped.
                if summary["SummaryKey"] != 0:
                    item["summary_key"] = summary["SummaryKey"]
                    item["volume"] = summary["Volume"]
                    item["time"] = get_current_date()
                    yield item

        # Pagination: move on to the next car series.
        self.series_index += 1
        if self.series_index < len(self.series_list):
            url = self.base_url % self.series_list[self.series_index][0]
            yield Request(url=url,
                          callback=self.parse_tag_item,
                          meta={'item': copy.deepcopy(item)},
                          dont_filter=True)