Example #1
    def parse(self, response):  # forum parser
        for sel in response.xpath('//li[contains(@class, "j_thread_list")]'):
            data = json.loads(sel.xpath('@data-field').extract_first())
            item = ThreadItem()
            item['id'] = data['id']
            item['author'] = data['author_name']
            item['reply_num'] = data['reply_num']
            item['good'] = data['is_good']
            if not item['good']:
                # Normalize falsy values (None, 0, "") to an explicit False
                item['good'] = False
            item['title'] = sel.xpath(
                './/div[contains(@class, "threadlist_title")]/a/text()'
            ).extract_first()
            if self.filter and not self.filter(
                    item["id"], item["title"], item['author'],
                    item['reply_num'], item['good']):
                continue
            # Threads dropped by filter, and their replies, are not stored in the database

            yield item
            meta = {'thread_id': data['id'], 'page': 1}
            url = 'http://tieba.baidu.com/p/%d' % data['id']
            if self.see_lz:
                url += '?see_lz=1'
            yield scrapy.Request(url, callback=self.parse_post, meta=meta)
        next_page = response.xpath('//a[@class="next pagination-item "]/@href')
        self.cur_page += 1
        if next_page:
            if self.cur_page <= self.end_page:
                yield self.make_requests_from_url('http:' +
                                                  next_page.extract_first())
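For reference, each <li class="j_thread_list"> node carries its thread metadata as JSON in the data-field attribute, which is what json.loads consumes above. A minimal standalone sketch with hypothetical values (real list items carry more keys than the four the parser reads):

    import json

    # Hypothetical data-field payload for one thread in the forum listing
    raw = '{"id": 6543210987, "author_name": "some_user", "reply_num": 42, "is_good": 0}'
    data = json.loads(raw)
    print(data['id'], data['author_name'], data['reply_num'], data['is_good'])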
Example #2
    def parse(self, response):  # forum parser
        print("Crawling page %d..." % self.cur_page)
        for sel in response.xpath('//li[contains(@class, "j_thread_list")]'):
            data = json.loads(sel.xpath('@data-field').extract_first())
            item = ThreadItem()
            item['id'] = data['id']
            item['author'] = data['author_name']
            item['reply_num'] = data['reply_num']
            item['good'] = data['is_good']
            if not item['good']:
                # Normalize falsy values (None, 0, "") to an explicit False
                item['good'] = False
            item['title'] = sel.xpath(
                './/div[contains(@class, "threadlist_title")]/a/@title'
            ).extract_first()
            if self.filter and not self.filter(
                    item["id"], item["title"], item['author'],
                    item['reply_num'], item['good']):
                continue
            # Threads dropped by filter, and their replies, are not stored in the database

            yield item
            meta = {'thread_id': data['id'], 'page': 1}
            url = 'http://tieba.baidu.com/p/%d' % data['id']
            if self.see_lz:
                url += '?see_lz=1'
            yield scrapy.Request(
                url,
                callback=self.parse_post,
                meta=meta,
                headers={
                    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                                   'Chrome/79.0.3945.130 Safari/537.36')
                })
        next_page = response.xpath('//a[@class="next pagination-item "]/@href')
        self.cur_page += 1
        if next_page:
            if self.cur_page <= self.end_page:
                yield self.make_requests_from_url('http:' +
                                                  next_page.extract_first())
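Instead of repeating the User-Agent on every Request as this example does, the same header could be set once for the whole spider through Scrapy's standard USER_AGENT setting; a minimal sketch (the TiebaSpider class name is an assumption):

    import scrapy

    class TiebaSpider(scrapy.Spider):
        name = 'tieba'
        # Applies to every request this spider issues, making per-request
        # headers= arguments unnecessary
        custom_settings = {
            'USER_AGENT': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/79.0.3945.130 Safari/537.36'),
        }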
Example #3
    def parse(self, response):  # forum parser
        for sel in response.xpath('//li[contains(@class, "j_thread_list")]'):
            data = json.loads(sel.xpath('@data-field').extract_first())
            item = ThreadItem()
            item['id'] = data['id']
            item['author'] = data['author_name']
            item['reply_num'] = data['reply_num']
            item['good'] = data['is_good']
            if not item['good']:
                # Normalize falsy values (None, 0, "") to an explicit False
                item['good'] = False
            item['title'] = sel.xpath(
                './/div[contains(@class, "threadlist_title")]/a/text()'
            ).extract_first()
            yield item
            meta = {'thread_id': data['id'], 'page': 1}
            url = 'http://tieba.baidu.com/p/%d' % data['id']
            yield scrapy.Request(url, callback=self.parse_post, meta=meta)
        next_page = response.xpath('//a[@class="next pagination-item "]/@href')
        if next_page:
            self.max_page -= 1
            if self.max_page > 0:
                yield self.make_requests_from_url(next_page.extract_first())
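The ThreadItem these first three examples populate is not shown. Judging from the fields they assign, a minimal sketch of its declaration (an assumption; the real project may define more fields, and Examples #4 and #5 clearly use a variant with different field names):

    import scrapy

    class ThreadItem(scrapy.Item):
        # Field names inferred from the assignments in Examples #1-#3
        id = scrapy.Field()
        author = scrapy.Field()
        reply_num = scrapy.Field()
        good = scrapy.Field()
        title = scrapy.Field()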
Example #4
    def parse(self, response):  # forum parser

        pre_titleName = response.xpath('//head/title//text()').extract_first()
        print("Forum name from the <head> title:", pre_titleName)
        titleName = pre_titleName.split('-')[0]
        # The forum name doubles as the forum id
        titleId = titleName
        # Follower count (extract the text, not the bare selector)
        card_menNum = response.xpath(
            '//span[contains(@class, "card_menNum")]/text()').extract_first()
        # Post count
        card_infoNum = response.xpath(
            '//span[contains(@class, "card_infoNum")]/text()').extract_first()
        print("Follower count: {}".format(card_menNum))
        print("Post count: {}".format(card_infoNum))
        tiebaInfo = TiebaInfo()
        # Forum id; for now every redirect uses fw=<forum name>
        tiebaInfo['outId'] = titleId
        # Forum name and forum id are identical
        tiebaInfo['tiebaName'] = titleName
        # Follower count
        tiebaInfo['accountCount'] = card_menNum
        # Post count
        tiebaInfo['postCount'] = card_infoNum

        manager_groups = response.xpath(
            '//ul[contains(@class, "manager_groups aside_media_horizontal")]')
        print("Moderator list: {}".format(manager_groups))

        # Relative .// path so only links inside manager_groups are matched
        noreferrer_name_list = manager_groups.xpath(
            './/a[@rel="noreferrer"]/@title').extract()
        print("Moderator name list: {}".format(noreferrer_name_list))
        noreferrer_name_str = ",".join(noreferrer_name_list)
        # For now just store the corresponding names
        tiebaInfo["managerIds"] = noreferrer_name_str
        yield tiebaInfo

        thread_list = response.xpath('//li[contains(@class, "j_thread_list")]')

        thread_author_list = response.xpath(
            '//span[contains(@class, "tb_icon_author")]')
        # This timestamp is unusable here; its format is wrong
        # thread_create_time_list = response.xpath('//span[contains(@class, "is_show_create_time")]')

        for index, sel in enumerate(thread_list):
            data = json.loads(sel.xpath('@data-field').extract_first())

            user_data = thread_author_list.xpath(
                '@data-field').extract()[index]

            # create_time = thread_create_time_list.extract()[index]

            print("用户id信息:{}".format(user_data))
            print("用户名字:{}".format(data['author_name']))
            print("用户id的data类型:{}".format(type(user_data)))

            author_data = json.loads(user_data)
            item = ThreadItem()
            # Tieba account id
            item['tiebaAccountId'] = author_data['user_id']
            # External content id, i.e. the thread id
            item['outContentId'] = data['id']
            # The forum name doubles as the forum id
            item['tiebaInfoId'] = titleName
            # item['reply_num'] = data['reply_num']
            # item['good'] = data['is_good']
            print("User detail-page redirect token: {}".format(data['author_portrait']))

            # if not item['good']:
            #     item['good'] = False

            # Content (the thread title text)
            item['content'] = sel.xpath(
                './/div[contains(@class, "threadlist_title")]/a/@title'
            ).extract_first()
            # The item fields are named differently here, so feed the filter
            # callback straight from the parsed data-field values
            if self.filter and not self.filter(
                    data['id'], item['content'], data['author_name'],
                    data['reply_num'], data['is_good']):
                continue
            print("Value of filter: {}".format(self.filter))
            # Threads dropped by filter, and their replies, are not stored in the database

            url = 'http://tieba.baidu.com/p/%d' % data['id']
            if self.see_lz:
                url += '?see_lz=1'
            # Request the thread page to resolve its creation time
            yield scrapy.Request(url,
                                 callback=self.parse_thread_time,
                                 cb_kwargs=dict(item))

            # Fetch the corresponding detail page for floor processing
            # print("Calling user detail.......................")
            # yield scrapy.Request(url, callback=self.parse_user_detail, meta=meta)

        next_page = response.xpath('//a[@class="next pagination-item "]/@href')
        self.cur_page += 1
        if next_page:
            if self.cur_page <= self.end_page:
                yield self.make_requests_from_url('http:' +
                                                  next_page.extract_first())
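The self.filter checks above imply a user-supplied callable taking (id, title, author, reply_num, good) and returning a truthy value for threads worth keeping. A hypothetical example that keeps only highlighted threads with at least ten replies:

    # Hypothetical filter: drop a thread, and with it all of its replies,
    # unless it is marked "good" and has gathered at least ten replies
    def only_popular_good(thread_id, title, author, reply_num, good):
        return bool(good) and reply_num >= 10

    # spider.filter = only_popular_good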
Example #5
    def parse_thread_time(self, response, tiebaAccountId, outContentId,
                          tiebaInfoId, content):

        print(
            "First-floor kwargs: tiebaAccountId:{}, outContentId:{}, tiebaInfoId:{}".format(
                tiebaAccountId, outContentId, tiebaInfoId))

        item = ThreadItem()
        # Tieba account id
        item['tiebaAccountId'] = tiebaAccountId
        # External content id, i.e. the thread id
        item['outContentId'] = outContentId
        # The forum name doubles as the forum id
        item['tiebaInfoId'] = tiebaInfoId
        # Content
        item['content'] = content

        # Fetch the detail page once; the first floor is the thread itself
        # and carries the posting time. Keep it as a selector rather than an
        # extracted string so xpath still works on it in the else branch below.
        first_floor = response.xpath(
            "//div[contains(@class, 'l_post')]")[0]

        first_data_field = first_floor.xpath("@data-field").extract_first()
        # print("first_floor: {}".format(first_floor))
        print("first_data_field: {}".format(first_data_field))
        first_floor_data = json.loads(first_data_field)
        # Initialize the timestamp
        thread_time = None

        if 'date' in first_floor_data['content']:
            # Only older threads carry a 'date' key inside data-field
            thread_time = first_floor_data['content']['date']
            item['publishTime'] = thread_time
        else:
            thread_time = first_floor.xpath(".//span[@class='tail-info']") \
                .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')

            item['publishTime'] = thread_time

        print("时间:{}".format(thread_time))
        #时间格式处理
        thread_time = self.dealTime(thread_time)

        created_at = datetime.strptime(thread_time, "%Y-%m-%d %H:%M:%S")
        since_date = datetime.strptime(self.since_date, "%Y-%m-%d %H:%M:%S")

        send_Flag = created_at < since_date

        if send_Flag:
            item['isSend'] = False
        else:
            item['isSend'] = True
        yield item

        # If the thread is not being sent, its floors need not be fetched
        if not send_Flag:
            meta = {
                'threadId': outContentId,
                'tiebaInfoId': tiebaInfoId,
                'page': 1,
                self.const_active_tieba: {
                    tiebaInfoId: 0
                }
            }

            print("跳转帖子对应的楼层详情页URL:{}".format(response.url))
            yield scrapy.Request(response.url,
                                 callback=self.parse_post,
                                 meta=meta)
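This example relies on a self.dealTime helper that is not shown. Since the tail-info regex above captures only minute precision while strptime parses with "%Y-%m-%d %H:%M:%S", one plausible sketch is to pad the missing seconds (the body is an assumption, not the project's actual helper):

    # Hypothetical stand-in for dealTime: timestamps scraped from tail-info
    # look like "2020-01-01 12:34", so append ":00" before strptime
    def dealTime(self, thread_time):
        if thread_time and thread_time.count(':') == 1:
            thread_time += ':00'
        return thread_time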