Example 1
 def parse(self, response):
     if response.url.endswith('page=1'):
         all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
         if all_page:
             all_page = all_page.group(1)
             all_page = int(all_page)
             for page_num in range(2, all_page + 1):
                 page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                 yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
     tree_node = etree.HTML(response.body)
     comment_nodes = tree_node.xpath('//div[@class="c" and contains(@id,"C_")]')
     for comment_node in comment_nodes:
         comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href')
         if not comment_user_url:
             continue
         comment_item = CommentItem()
         comment_item['crawl_time'] = int(time.time())
         comment_item['weibo_id'] = response.url.split('/')[-1].split('?')[0]
         comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url[0]).group(1)
         comment_item['content'] = extract_comment_content(etree.tostring(comment_node, encoding='unicode'))
         comment_item['_id'] = comment_node.xpath('./@id')[0]
         created_at_info = comment_node.xpath('.//span[@class="ct"]/text()')[0]
         like_num = comment_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
         comment_item['like_num'] = int(re.search('\d+', like_num).group())
         comment_item['created_at'] = time_fix(created_at_info.split('\xa0')[0])
         yield comment_item
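These examples all share the same pagination idiom: on page 1, the total page count is read from the "1/N页" marker and one request is queued for every remaining page. Below is a minimal standalone sketch of that idiom pulled out of the spider class for clarity; the build_page_requests helper name is hypothetical, while the regex and URL rewriting are the ones used in the examples.

import re

from scrapy import Request


def build_page_requests(response, callback):
    # Hypothetical helper illustrating the page-1 expansion used in the examples.
    if not response.url.endswith('page=1'):
        return
    # weibo.cn prints "1/N页" on the first page; N is the total page count.
    all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
    if not all_page:
        return
    for page_num in range(2, int(all_page.group(1)) + 1):
        page_url = response.url.replace('page=1', 'page={}'.format(page_num))
        # dont_filter=True keeps the dupe filter from dropping these similar URLs;
        # meta is forwarded so downstream callbacks keep their context.
        yield Request(page_url, callback, dont_filter=True, meta=response.meta)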
Example 2
 def parse_comment(self, response):
     # If this is page 1, fetch all the remaining pages at once
     if response.url.endswith('page=1'):
         all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
         if all_page:
             all_page = all_page.group(1)
             all_page = int(all_page)
             for page_num in range(2, all_page + 1):
                 page_url = response.url.replace('page=1',
                                                 'page={}'.format(page_num))
                 yield Request(page_url,
                               self.parse_comment,
                               dont_filter=True,
                               meta=response.meta)
     selector = Selector(response)
     comment_nodes = selector.xpath(
         '//div[@class="c" and contains(@id,"C_")]')
     for comment_node in comment_nodes:
         comment_user_url = comment_node.xpath(
             './/a[contains(@href,"/u/")]/@href').extract_first()
         if not comment_user_url:
             continue
         comment_item = CommentItem()
         comment_item['crawl_time'] = int(time.time())
         comment_item['weibo_url'] = response.meta['weibo_url']
         comment_item['comment_user_id'] = re.search(
             r'/u/(\d+)', comment_user_url).group(1)
         comment_item['content'] = comment_node.xpath(
             './/span[@class="ctt"]').xpath('string(.)').extract_first()
         comment_item['_id'] = comment_node.xpath('./@id').extract_first()
         created_at = comment_node.xpath(
             './/span[@class="ct"]/text()').extract_first()
         comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
         yield comment_item
Example 3
 def parse(self, response):
     if response.url.endswith('page=1'):
         all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
         if all_page:
             all_page = all_page.group(1)
             all_page = int(all_page)
             all_page = all_page if all_page <= 50 else 50
             for page_num in range(2, all_page + 1):
                 page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                 yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
     tree_node = etree.HTML(response.body)
     repo_nodes = tree_node.xpath('//div[@class="c" and not(contains(@id,"M_"))]') 
     for repo_node in repo_nodes:
         repo_user_url = repo_node.xpath('.//a[contains(@href,"/u/")]/@href')
         if not repo_user_url:
             continue
         repo_item = RepostItem()
         #repo_item['_id'] = ''
         repo_item['crawl_time'] = int(time.time())
         repo_item['weibo_id'] = response.url.split('/')[-1].split('?')[0]
         repo_item['user_id'] = re.search(r'/u/(\d+)', repo_user_url[0]).group(1)
         content = extract_repost_content(etree.tostring(repo_node, encoding='unicode'))
         repo_item['content'] = content.split(':', maxsplit=1)[1]
         created_at_info = repo_node.xpath('.//span[@class="ct"]/text()')[0].split('\xa0')
         repo_item['created_at'] = time_fix((created_at_info[0]+created_at_info[1]))
         yield repo_item
Example 4
    def parse_tweet(self, response):
        """Fetch tweet information."""
        if response.url.endswith('page=1'):
            # If this is page 1, fetch all the remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    print(page_url)
                    yield scrapy.Request(page_url,
                                         self.parse_tweet,
                                         dont_filter=True,
                                         meta=response.meta)
        """获取微博内容"""
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.cn/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                # Check whether there is a "Read full text" link:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield scrapy.Request(all_content_url,
                                         callback=self.parse_all_content,
                                         meta={'item': tweet_item},
                                         priority=1)
                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # # Crawl this tweet's comments
                # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                # yield scrapy.Request(url=comment_url,
                #                      callback=self.parse_comment,
                #                      meta={'weibo_url': tweet_item['weibo_url']})
            except Exception as e:
                self.logger.error(e)
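When a tweet is truncated, the spider follows the "全文" (full text, ckAll=1) link and hands the partially filled item along in meta={'item': tweet_item}. The parse_all_content callback itself is not part of these excerpts; the following is a hypothetical reconstruction of what such a callback could look like, assuming the same extract_weibo_content helper and lxml usage as above.

    def parse_all_content(self, response):
        # Hypothetical sketch, not the original implementation: recover the item
        # handed over via meta and fill in the full text from the ckAll=1 page.
        tweet_item = response.meta['item']
        tree_node = etree.HTML(response.body)
        content_node = tree_node.xpath('//div[@class="c" and @id]')[0]
        tweet_html = etree.tostring(content_node, encoding='unicode')
        tweet_item['content'] = extract_weibo_content(tweet_html)
        yield tweet_item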
Example 5
    def parse(self, response):
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse,
                                  dont_filter=True,
                                  meta=response.meta)
                # If this is the search API and the day-granularity query already fills 100 pages (the cap), split it further by hour
                if 'search/mblog' in response.url and all_page == 100 and '-' not in response.url:
                    start_time_string = re.search(
                        r'starttime=(\d+)&', unquote(response.url,
                                                     "utf-8")).group(1)
                    keyword = re.search(r'keyword=(.*?)&',
                                        unquote(response.url,
                                                "utf-8")).group(1)
                    self.logger.info(
                        f'split by hour,{start_time_string},{keyword}, {unquote(response.url, "utf-8")}'
                    )
                    date_start = datetime.datetime.strptime(
                        start_time_string, "%Y%m%d")
                    time_spread = datetime.timedelta(days=1)
                    url_format_by_hour = "https://weibo.cn/search/mblog?hideSearchFrame=&keyword={}&advancedfilter=1&starttime={}&endtime={}&sort=time&atten=1&page=1"
                    one_day_back = date_start - time_spread
                    # from today's 7:00-8:00 to 23:00-24:00
                    for hour in range(7, 24):
                        # calculation rule of starting time: start_date 8:00am + offset:16
                        begin_hour = one_day_back.strftime(
                            "%Y%m%d") + "-" + str(hour + 16)
                        # calculation rule of ending time: (end_date+1) 8:00am + offset:-7
                        end_hour = one_day_back.strftime("%Y%m%d") + "-" + str(
                            hour - 7)
                        page_url = url_format_by_hour.format(
                            keyword, begin_hour, end_hour)
                        yield Request(page_url,
                                      self.parse,
                                      dont_filter=True,
                                      meta=response.meta)
                    two_day_back = one_day_back - time_spread
                    # from today's 0:00-1:00am to 6:00-7:00am
                    for hour in range(0, 7):
                        # note the offset change because we are two days back now
                        begin_hour = two_day_back.strftime(
                            "%Y%m%d") + "-" + str(hour + 40)
                        end_hour = two_day_back.strftime("%Y%m%d") + "-" + str(
                            hour + 17)
                        page_url = url_format_by_hour.format(
                            keyword, begin_hour, end_hour)
                        yield Request(page_url,
                                      self.parse,
                                      dont_filter=True,
                                      meta=response.meta)

        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = user_tweet_id.group(1)
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)
                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

            except Exception as e:
                self.logger.error(e)
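The hour-splitting branch above encodes the start and end hours as fixed offsets relative to one or two days back, which the inline comments only hint at. Purely to illustrate what those loops emit, here is the same string formatting evaluated for an assumed starttime=20200310; the values follow mechanically from the code above.

import datetime

date_start = datetime.datetime.strptime("20200310", "%Y%m%d")
time_spread = datetime.timedelta(days=1)
one_day_back = date_start - time_spread    # 2020-03-09
two_day_back = one_day_back - time_spread  # 2020-03-08

# First loop, hour = 7:  starttime=20200309-23, endtime=20200309-0
print(one_day_back.strftime("%Y%m%d") + "-" + str(7 + 16),
      one_day_back.strftime("%Y%m%d") + "-" + str(7 - 7))
# Second loop, hour = 0: starttime=20200308-40, endtime=20200308-17
print(two_day_back.strftime("%Y%m%d") + "-" + str(0 + 40),
      two_day_back.strftime("%Y%m%d") + "-" + str(0 + 17))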
Example 6
    def parse(self, response):
        selector = Selector(response)

        # todo: finish crawling the data of a single tweet page; see tweet.py and fan.py for reference implementations.
        #  Page layout reference: https://weibo.cn/comment/JltR1rvSK
        '''
        tweet_item['crawl_time'] = ''
        tweet_repost_url = ''
        user_tweet_id = ''
        tweet_item['weibo_url'] = ''
        tweet_item['user_id'] = ''
        tweet_item['_id'] = ''
        tweet_item['created_at'] = ''
        tweet_item['tool'] = ''
        tweet_item['created_at'] = ''
        tweet_item['like_num'] = ''
        tweet_item['repost_num'] = ''
        tweet_item['comment_num'] = ''
        tweet_item['image_url'] = ''
        tweet_item['video_url'] = ''
        '''

        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)

        tweet_item = TweetItem()
        tweet_item['crawl_time'] = int(time.time())
        tweet_repost_url = selector.xpath('.//a[contains(text(),"转发[")]/@href').extract_first()
        user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
        tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
        tweet_item['user_id'] = user_tweet_id.group(2)
        tweet_item['_id'] = user_tweet_id.group(1)
        create_time_info_node = selector.xpath('.//span[@class="ct"]')[-1]
        create_time_info = create_time_info_node.xpath('string(.)').extract_first()
        if "来自" in create_time_info:
            tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
            tweet_item['tool'] = create_time_info.split('来自')[1].strip()
        else:
            tweet_item['created_at'] = time_fix(create_time_info.strip())

        like_num = selector.xpath('.//a[contains(text(),"赞[")]/text()').extract()[-1]
        tweet_item['like_num'] = int(re.search('\d+', like_num).group())

        repost_num = selector.xpath('.//a[contains(text(),"转发[")]/text()').extract()[-1]
        tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())

        comment_num = selector.xpath(
            './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()').extract()[-1]
        tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())

        images = selector.xpath('.//img[@alt="图片"]/@src').extract()
        if images:
            tweet_item['image_url'] = images

        videos = selector.xpath('.//a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href').extract()
        if videos:
            tweet_item['video_url'] = videos

        map_node = selector.xpath('.//a[contains(text(),"显示地图")]')
        if map_node:
            map_node = map_node[0]
            map_node_url = map_node.xpath('./@href').extract_first()
            map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
            tweet_item['location_map_info'] = map_info

        repost_node = selector.xpath('.//a[contains(text(),"原文评论[")]/@href').extract()
        if repost_node:
            tweet_item['origin_weibo'] = repost_node[0]

        all_content_link = selector.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
        if all_content_link:
            all_content_url = self.base_url + all_content_link[0].xpath('./@href').extract_first()
            yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                          priority=1)
        else:
            # When the content needs further processing in another callback, pass the data collected so far via request.meta['item']
            request_meta = response.meta
            request_meta['item'] = tweet_item
            yield tweet_item
Example 7
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, fetch all the remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        Parse the data on this page.
        """
        all_page = re.search(r'/>&nbsp;(\d+)/(\d+)页</div>', response.text)

        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                tweet_repost_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]
                tweet_original_node = tweet_node.xpath('.//span[@class="cmt"]')

                # Check whether there is a "Read full text" link:
                all_content_link = tweet_repost_node.xpath('.//a[text()="全文"]')

                # An original tweet only has the "ctt" span
                # For a repost, "ctt" holds the reposted tweet's content and "cmt" holds the repost comment
                if tweet_original_node:
                    repost_content = tweet_original_node[0].xpath('string(.)').strip().replace(u"\xa0", " ") + \
                                     tweet_repost_node.xpath('string(.)').replace('\u200b', '').replace('\u2028', '').strip()
                    content = re.findall(
                        '(.*?)//',
                        re.findall(r'转发理由:(.*?)赞',
                                   tweet_node.xpath('string(.)'))[0].replace(
                                       '\u2028', '').strip())
                    if content:
                        original_content = "转发理由:" + content[0]
                    else:
                        original_content = "转发理由:" + \
                                           re.findall(r'转发理由:(.*?)赞', tweet_node.xpath('string(.)'))[0]\
                                               .replace('\u2028', '').strip()
                    tweet_item['content'] = repost_content + "+" + original_content
                    yield tweet_item
                else:
                    if all_content_link:
                        all_content_url = self.base_url + all_content_link[
                            0].xpath('./@href')[0]
                        yield Request(all_content_url,
                                      callback=self.parse_all_content,
                                      meta={'item': tweet_item},
                                      priority=1)
                    else:
                        all_content = tweet_repost_node.xpath('string(.)').replace('\u200b', '')\
                            .replace('\u2028', '').strip()
                        tweet_item['content'] = all_content
                        yield tweet_item

                # tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]

                # Check whether there is a "Read full text" link:
                # all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
                # if all_content_link:
                #     all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                #     yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                #                   priority=1)
                #
                # else:
                #     all_content = tweet_content_node.xpath('string(.)').replace('\u200b', '').strip()
                #     tweet_item['content'] = all_content
                #     # yield tweet_item
                #     print(tweet_item)

                # Crawl this tweet's comments
                # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                # yield Request(url=comment_url, callback=self.parse_comment,
                #               meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)

        if all_page:
            if response.url.endswith(all_page.group(2)):
                while True:
                    user = self.collection.find_one({'tweet_flag': 'false'})
                    if user is not None:
                        uid = user['_id']
                        yield Request(url="https://weibo.cn/%s/info" % uid,
                                      callback=self.parse_information)
                        self.collection.update_one(
                            {'_id': uid}, {'$set': {
                                'tweet_flag': 'true'
                            }})
                        break
                    else:
                        print("暂时没有可爬取的id")
                        time.sleep(10)
        else:
            while True:
                user = self.collection.find_one({'tweet_flag': 'false'})
                if user is not None:
                    uid = user['_id']
                    yield Request(url="https://weibo.cn/%s/info" % uid,
                                  callback=self.parse_information)
                    self.collection.update_one(
                        {'_id': uid}, {'$set': {
                            'tweet_flag': 'true'
                        }})
                    break
                else:
                    print("暂时没有可爬取的id")
                    time.sleep(10)
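Example 7 claims a pending uid from MongoDB with a separate find_one followed by update_one; if several spider processes share the collection, two of them can claim the same uid between those two calls. Below is a hedged alternative sketch using pymongo's find_one_and_update to claim a document atomically; the database and collection names are hypothetical, while the tweet_flag convention is the one used above.

from pymongo import MongoClient, ReturnDocument

# Hypothetical handle; Example 7 only exposes self.collection.
collection = MongoClient()['weibo']['users']

# Atomically flip tweet_flag from 'false' to 'true' and return the claimed
# document, so concurrent spiders cannot pick up the same uid twice.
user = collection.find_one_and_update(
    {'tweet_flag': 'false'},
    {'$set': {'tweet_flag': 'true'}},
    return_document=ReturnDocument.BEFORE,
)
if user is not None:
    uid = user['_id']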