Example 1
    def parse(self, response):
        for group in response.data['groups']:
            group_id = group['group_id']

            # Skip groups that are explicitly excluded in the settings
            if group_id in settings.IGNORE_GROUP_ID:
                continue
            yield GroupItem(_id=group_id, data=group)

            # Latest topics for this group
            yield scrapy.Request(XmqApi.URL_TOPICS(group_id),
                                 callback=self.parse_topic)
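
This method relies on project-specific pieces that the example does not show: item classes with _id and data fields, an IGNORE_GROUP_ID collection in the Scrapy settings, and an XmqApi helper that builds API URLs. A minimal sketch of what those definitions might look like; the host, paths, and field layout are assumptions inferred only from how they are used above:

    # items.py -- sketch; field names inferred from the calls in the examples
    import scrapy

    class GroupItem(scrapy.Item):
        _id = scrapy.Field()
        data = scrapy.Field()

    # settings.py -- hypothetical value; the real group ids are project-specific
    IGNORE_GROUP_ID = []

    # api.py -- sketch of the URL helper; the host and path are placeholders
    class XmqApi:
        BASE = 'https://api.example.com/v1'

        @staticmethod
        def URL_TOPICS(group_id, end_time=None):
            url = '%s/groups/%s/topics' % (XmqApi.BASE, group_id)
            if end_time is not None:
                url += '?end_time=%s' % end_time
            return url

The other item classes used in Example 2 (TopicItem, TopicImagesItem, TopicFilesItem) presumably follow the same pattern, adding the group_name, image_urls, and file_urls fields seen there.
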
Example 2
    def parse_topic(self, response):
        topics = response.data['topics']

        for topic in topics:
            topic_id, group_name = topic['topic_id'], topic['group']['name']
            yield TopicItem(_id=topic_id, data=topic, group_name=group_name)

            if topic['type'] == 'talk':

                # Image attachments; image_urls is the field name Scrapy's
                # ImagesPipeline expects, so materialize the map into a list
                images = topic['talk'].get('images')
                if images:
                    image_urls = list(map(XmqApi.get_image_url, images))
                    yield TopicImagesItem(_id=topic_id,
                                          data=images,
                                          group_name=group_name,
                                          image_urls=image_urls)

                # File attachments: resolve download URLs one request at a
                # time, carrying the partially built item through request meta
                files = topic['talk'].get('files')
                if files:
                    item = TopicFilesItem(_id=topic_id,
                                          data=files,
                                          group_name=group_name,
                                          file_urls=list())
                    url = XmqApi.URL_FILE_DOWNLOAD(files[0]['file_id'])
                    yield scrapy.Request(url,
                                         callback=self.parse_file,
                                         meta={
                                             'item': item,
                                             # index of the next file to fetch
                                             'i': 1
                                         })

        # Next batch of topics, paginated by the last topic's create_time
        if topics:
            last_topic = topics[-1]
            url = XmqApi.URL_TOPICS(last_topic['group']['group_id'],
                                    last_topic['create_time'])
            yield scrapy.Request(url, callback=self.parse_topic)
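
parse_file is used as a callback above but is not part of these examples. A plausible sketch of it, assuming the download endpoint exposes the final file URL in the JSON response (the download_url key is an assumption) and that the spider walks the files list one request at a time via the 'item' and 'i' keys in meta:

    def parse_file(self, response):
        # Hypothetical continuation: record the resolved URL for the current
        # file, then either request the next file or emit the finished item.
        item = response.meta['item']
        i = response.meta['i']
        item['file_urls'].append(response.data['download_url'])  # assumed key

        files = item['data']
        if i < len(files):
            url = XmqApi.URL_FILE_DOWNLOAD(files[i]['file_id'])
            yield scrapy.Request(url,
                                 callback=self.parse_file,
                                 meta={'item': item, 'i': i + 1})
        else:
            yield item

Note that the image_urls and file_urls fields only trigger downloads if scrapy.pipelines.images.ImagesPipeline and scrapy.pipelines.files.FilesPipeline are enabled in ITEM_PIPELINES, with IMAGES_STORE and FILES_STORE configured.
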