Example 1
    def parse_tweet(self, response):
        page_url = response.url
        tweet_item = TweetsItem()
        tree_node = etree.HTML(response.body)
        tweet_content_node = tree_node.xpath('.//span[@class="ctt"]')[0]
        all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
        tweet_item['content'] = all_content
        tweet_item['crawl_time'] = int(time.time())

        user_tweet_id = re.search(r'https://weibo.cn/(\d+)/(.*)', page_url)
        tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
            user_tweet_id.group(1), user_tweet_id.group(2))
        tweet_item['user_id'] = user_tweet_id.group(1)
        tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                           user_tweet_id.group(1))
        create_time_info = tree_node.xpath(
            './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
        tweet_item['created_at'] = time_fix(
            create_time_info.split('来自')[0].strip())
        like_num = tree_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
        tweet_item['like_num'] = int(re.search('\d+', like_num).group())
        repost_num = tree_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
        tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())
        comment_num = tree_node.xpath(
            './/span[@class="pms" and contains(text(),"评论[")]/text()')[0]
        tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())
        yield tweet_item
        comment_url = page_url + '?page=1'
        yield Request(url=comment_url,
                      callback=self.parse_comment,
                      meta={'weibo_url': page_url})
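
The parse_tweet callback above (and the parse_comment callback that follows) relies on a time_fix helper that is not included in these snippets. A minimal sketch of what such a helper might look like, assuming weibo.cn's usual timestamp formats ("5分钟前", "今天 12:30", "12月20日 22:28"); the actual implementation in the project may differ:

# Hypothetical sketch of the time_fix helper these snippets assume; it normalizes
# weibo.cn's relative timestamps into "YYYY-MM-DD HH:MM" strings.
import re
from datetime import datetime, timedelta

def time_fix(time_string):
    now = datetime.now()
    if '分钟前' in time_string:
        minutes = int(re.search(r'(\d+)分钟前', time_string).group(1))
        return (now - timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M')
    if time_string.startswith('今天'):
        return now.strftime('%Y-%m-%d ') + time_string.replace('今天', '').strip()
    if '月' in time_string and '日' in time_string:
        # assume the current year for "MM月DD日 HH:MM" style timestamps
        parsed = datetime.strptime('{}年{}'.format(now.year, time_string),
                                   '%Y年%m月%d日 %H:%M')
        return parsed.strftime('%Y-%m-%d %H:%M')
    return time_string.strip()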
 def parse_comment(self, response):
      # If this is page 1, queue requests for all remaining pages at once
     if response.url.endswith('page=1'):
         all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
         if all_page:
             all_page = all_page.group(1)
             all_page = int(all_page)
             for page_num in range(2, all_page + 1):
                 page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                 yield Request(page_url, self.parse_comment, dont_filter=True, meta=response.meta)
     tree_node = etree.HTML(response.body)
     comment_nodes = tree_node.xpath('//div[@class="c" and contains(@id,"C_")]')
     for comment_node in comment_nodes:
         comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href')
         if not comment_user_url:
             continue
         comment_item = CommentItem()
         comment_item['crawl_time'] = int(time.time())
         comment_item['weibo_url'] = response.meta['weibo_url']
         comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url[0]).group(1)
         comment_item['content'] = extract_comment_content(etree.tostring(comment_node, encoding='unicode'))
         comment_item['_id'] = comment_node.xpath('./@id')[0]
         created_at_info = comment_node.xpath('.//span[@class="ct"]/text()')[0]
         like_num = comment_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
         comment_item['like_num'] = int(re.search('\d+', like_num).group())
         comment_item['created_at'] = time_fix(created_at_info.split('\xa0')[0])
         yield comment_item
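
Every paginated callback in these examples repeats the same footer regex to find the total page count. As a rough illustration of the pattern, it can be factored into a small helper (get_total_pages is a name introduced here, not part of the original code):

import re

def get_total_pages(page_text):
    """Total page count parsed from weibo.cn's pager footer, e.g. '/>&nbsp;1/42页</div>'."""
    match = re.search(r'/>&nbsp;1/(\d+)页</div>', page_text)
    return int(match.group(1)) if match else 1

# Usage inside a callback (sketch):
# if response.url.endswith('page=1'):
#     for page_num in range(2, get_total_pages(response.text) + 1):
#         yield Request(response.url.replace('page=1', 'page={}'.format(page_num)),
#                       self.parse_comment, dont_filter=True, meta=response.meta)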
Example 3
 def parse_comment(self, response):
      # If this is page 1, queue requests for all remaining pages at once
     if response.url.endswith('page=1'):
         all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
         if all_page:
             all_page = all_page.group(1)
             all_page = int(all_page)
             for page_num in range(2, all_page + 1):
                 page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                 yield Request(page_url, self.parse_comment, dont_filter=True, meta=response.meta)
     selector = Selector(response)
     comment_nodes = selector.xpath('//div[@class="c" and contains(@id,"C_")]')
     for comment_node in comment_nodes:
         comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
         if not comment_user_url:
             continue
         comment_item = CommentItem()
         comment_item['crawl_time'] = int(time.time())
         comment_item['weibo_url'] = response.meta['weibo_url']
         comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url).group(1)
         comment_item['content'] = comment_node.xpath('.//span[@class="ctt"]').xpath('string(.)').extract_first()
         comment_item['_id'] = comment_node.xpath('./@id').extract_first()
         created_at = comment_node.xpath('.//span[@class="ct"]/text()').extract_first()
         comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
         yield comment_item
Example 5
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, queue requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
        """
        Parse the data on this page.
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
                create_time_info = tweet_node.xpath('.//span[@class="ct" and contains(text(),"来自")]/text()')[0]
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())

                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
                tweet_item['like_num'] = int(re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
                tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[0]
                tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())

                tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]

                # Check whether there is a 'Read full text' (全文) link:
                all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
                    tweet_item['content'] = all_content
                    yield tweet_item

                # Crawl the comments of this weibo
                comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment, meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
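
Several examples hand truncated tweets off to a parse_all_content callback that is not shown here. A plausible sketch, assuming the "全文" page repeats the tweet body in a span with class "ctt" and that the partially filled item travels in meta (the selector is an assumption, not the project's confirmed layout):

# Hypothetical sketch of the parse_all_content callback the snippets yield to.
from lxml import etree

def parse_all_content(self, response):
    tweet_item = response.meta['item']  # partially filled TweetsItem
    tree_node = etree.HTML(response.body)
    content_node = tree_node.xpath('//span[@class="ctt"]')
    if content_node:
        tweet_item['content'] = content_node[0].xpath('string(.)').replace('\u200b', '').strip()
    yield tweet_item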
Example 6
    def parse_rep(self, response):
        # If this is page 1, queue requests for all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_rep,
                                  dont_filter=True,
                                  meta=response.meta)
        selector = Selector(response)
        comment_nodes = selector.xpath('//div[@class="c"]')
        for comment_node in comment_nodes:
            try:
                comment_user_url = comment_node.xpath(
                    './/a[contains(@href,"/u/")]/@href').extract_first()
                if not comment_user_url:
                    continue
                comment_item = RepostItem()
                comment_item['crawl_time'] = int(time.time())
                comment_item['weibo_url'] = response.meta['weibo_url']
                uid = re.search(r'/u/(\d+)', comment_user_url).group(1)
                comment_item['rep_user_id'] = uid

                # Request the reposting user's profile page (handled by parse_rep_information)
                yield Request(url="https://weibo.cn/%s/info" % uid,
                              callback=self.parse_rep_information)
                content = comment_node.xpath('./text()').extract_first()
                # at_list = comment_node.xpath('.//span[@class="ctt"]').xpath('a').xpath('string(.)').extract()
                # for a in at_list:
                #     content = content.replace(a, '@*' + a + '*@')
                comment_item['content'] = content
                # comment_item['_id'] = comment_node.xpath('./@id').extract_first()
                # example raw value: '\xa012月20日 22:28\xa0来自红米Note7 4800万相机'
                created_at = comment_node.xpath(
                    './/span[@class="ct"]/text()').extract_first()
                comment_item['created_at'] = time_fix(
                    created_at.split('\xa0')[1])
            except:
                pass
            else:
                try:
                    comment_item['created_device'] = created_at.split(
                        '\xa0')[2]
                except:
                    pass
                yield comment_item
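
The commented-out sample above shows why the "ct" span is split on '\xa0': the string starts with the separator, so index 1 is the timestamp and index 2 is the device string. A quick worked example:

# Worked example of the '\xa0' split used in parse_rep.
created_at = '\xa012月20日 22:28\xa0来自红米Note7 4800万相机'
parts = created_at.split('\xa0')
# parts == ['', '12月20日 22:28', '来自红米Note7 4800万相机']
print(parts[1])  # the creation time fed to time_fix
print(parts[2])  # the device / client string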
Example 7
 def parse_comment(self, response):
      # If this is page 1, queue requests for all remaining pages at once
     if response.url.endswith('page=1'):
         all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
         if all_page:
             all_page = all_page.group(1)
             all_page = int(all_page)
             if (all_page > self.MAX_COMMENT_PAGES):
                 all_page = self.MAX_COMMENT_PAGES
             for page_num in range(2, all_page + 1):
                 page_url = response.url.replace('page=1',
                                                 'page={}'.format(page_num))
                 yield Request(page_url,
                               self.parse_comment,
                               dont_filter=True,
                               meta=response.meta)
     selector = Selector(response)
     comment_nodes = selector.xpath(
         '//div[@class="c" and contains(@id,"C_")]')
     for comment_node in comment_nodes:
         comment_user_url = comment_node.xpath(
             './/a[contains(@href,"/u/")]/@href').extract_first()
         if not comment_user_url:
             continue
         comment_item = CommentItem()
         comment_item['crawl_time'] = int(time.time())
         comment_item['weibo_url'] = response.meta['weibo_url']
         comment_item['comment_user_id'] = re.search(
             r'/u/(\d+)', comment_user_url).group(1)
         comment_item['content'] = comment_node.xpath(
             './/span[@class="ctt"]').xpath('string(.)').extract_first()
         created_at = comment_node.xpath(
             './/span[@class="ct"]/text()').extract_first()
         comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
         try:
             s = SnowNLP(comment_item['content'])
             comment_item['sentiments'] = str(s.sentiments * 10)[0:8]
         except:
             comment_item['sentiments'] = '5.0'
         try:
             sql = "INSERT INTO `sbhdb`.`weibo_comment`( `comment_user_id`, `content`, `weibo_url`, `created_at`, `crawl_time`, `sentiments`) VALUES ( '%s', '%s', '%s', '%s', %s,%s)" % (
                 comment_item['comment_user_id'], comment_item['content'],
                 comment_item['weibo_url'], comment_item['created_at'],
                 comment_item['crawl_time'], comment_item['sentiments'])
             self.cursor.execute(sql)
             self.db.commit()
         except:
              # duplicate row already exists
             pass
         yield comment_item
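
Example 7 scores each comment with SnowNLP. Its sentiments attribute is the probability, between 0 and 1, that the text is positive, which the spider rescales to a 0-10 string. A standalone illustration of that call:

# Standalone illustration of the SnowNLP sentiment scoring used above.
from snownlp import SnowNLP

s = SnowNLP(u'这条微博写得真好')
score = s.sentiments           # float in [0, 1], probability of positive sentiment
print(str(score * 10)[0:8])    # the spider stores it rescaled to 0-10, truncated to 8 chars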
Example 8
    def parse_repost(self, response):
        # If this is page 1, queue requests for all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_repost,
                                  dont_filter=True,
                                  meta=response.meta,
                                  priority=2)
        tree_node = etree.HTML(response.body)
        repost_nodes = tree_node.xpath(
            '//div[@class="c" and not(@id="M_") and .//span[contains(@class,"cc")]]'
        )
        for repost_node in repost_nodes:
            repost_user_url = repost_node.xpath(
                './/a[contains(@href,"/")]/@href')
            if not repost_user_url:
                continue
            repost_item = RepostItem()
            repost_item['crawl_time_utc'] = dt.utcnow()
            repost_item['weibo_url'] = response.meta['weibo_url']
            repost_item['repost_user_id'] = re.search(
                r'(/u/(\d+))|(/(\w+))', repost_user_url[0]).group(0)
            repost_item['content'] = repost_node.xpath('.//text()')[0]
            like_num = repost_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            repost_item['like_num'] = int(re.search('\d+', like_num).group())
            created_at_info = repost_node.xpath(
                './/span[@class="ct"]/text()')[0]
            #print("[DEBUG] repost CT:"+created_at_info)
            #print("[DEBUG] repost CT:"+created_at_info.split('\xa0')[0])
            repost_item['created_at'] = time_fix(
                created_at_info.strip('\xa0').split('\xa0')[0])
            #print("[DEBUG] repost CT:"+repost_item['created_at'])
            repost_item['_id'] = repost_item['repost_user_id'] + repost_item[
                'weibo_url']
            hot_repost = repost_node.xpath('.//span[@class="kt"]/text()')
            if hot_repost:
                repost_item['hot_repost'] = True
            else:
                repost_item['hot_repost'] = False

            yield repost_item
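
The item classes (TweetsItem, CommentItem, RepostItem, ...) are defined elsewhere in these projects. Judging only from the fields assigned in Example 8, a RepostItem along these lines would suffice; the field list is inferred, not the original definition:

# Inferred sketch of the RepostItem used in Example 8.
from scrapy import Item, Field

class RepostItem(Item):
    _id = Field()              # repost_user_id + weibo_url
    weibo_url = Field()        # URL of the reposted weibo
    repost_user_id = Field()   # uid path or custom path of the reposting user
    content = Field()          # repost text
    like_num = Field()         # likes on the repost
    created_at = Field()       # normalized repost time
    hot_repost = Field()       # True when marked as a hot repost (span class "kt")
    crawl_time_utc = Field()   # UTC datetime when the repost was crawled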
Example 9
 def parse_repost(self, response):
      # If this is page 1, queue requests for all remaining pages at once
     if response.url.endswith('page=1'):
         all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
         if all_page:
             all_page = all_page.group(1)
             all_page = int(all_page)
             for page_num in range(2, all_page + 1):
                 page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                 yield Request(page_url, self.parse_repost, dont_filter=True, meta=response.meta)
     selector = Selector(response)
     repost_nodes = selector.xpath('//div[@class="c"]')
     for repost_node in repost_nodes:
         repost_user_url = repost_node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
         if not repost_user_url:
             continue
         repost_item = RepostItem()
         repost_item['crawl_time'] = int(time.time())
         repost_item['weibo_url'] = response.meta['weibo_url']
         repost_item['repost_user_id'] = re.search(r'/u/(\d+)', repost_user_url).group(1)
         attitude_text = repost_node.xpath('.//span[@class="cc"]/a[contains(@href,"/attitude/")]//text()').extract_first()
          # likes on the repost
         attitude = re.findall('赞\[(\d+)\]', str(attitude_text))
         if attitude is not None and len(attitude)>=1:
             repost_item['attitude'] = int(attitude[0])
          # comment text attached to the repost
         text1 = ";".join(repost_node.xpath('.//text()').extract())
         comment = re.findall(';:(.*?);', text1)
         if comment and comment[0]:
             repost_item["comment"] = comment[0].replace(u"\xa0", "")
         created_at = repost_node.xpath('.//span[@class="ct"]/text()').extract_first()
         if created_at is not None:
              # repost time
             repost_item['created_at'] = time_fix(created_at.split('\xa0')[1])
              # device used for the repost
             repost_item['device'] = created_at.split('\xa0')[2]
         repost_item['weibo_user_id'] = re.search(r'.*?com/((\d+))/',repost_item['weibo_url']).group(1)
         repost_item['_id'] = repost_item['repost_user_id'] +'-'+ repost_item['weibo_user_id']
         yield Request(url="https://weibo.cn/{}/info".format(repost_item['repost_user_id']),
                       callback=self.parse_information, priority=1)
         yield repost_item
Example 10
    def parse_like(self, response):
        # If this is page 1, queue requests for all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_like,  # paginate the like pages with this same callback
                                  dont_filter=True,
                                  meta=response.meta)
        selector = Selector(response)
        comment_nodes = selector.xpath('//div[@class="c"]')
        for comment_node in comment_nodes:
            try:
                comment_user_url = comment_node.xpath(
                    './/a[contains(@href,"/u/")]/@href').extract_first()
            except:
                pass
            else:
                if not comment_user_url:
                    continue
                comment_item = LikeItem()
                comment_item['crawl_time'] = int(time.time())
                comment_item['weibo_url'] = response.meta['weibo_url']
                uid = re.search(r'/u/(\d+)', comment_user_url).group(1)

                comment_item['like_user_id'] = uid
                # Request this user's profile page (handled by parse_like_information)
                yield Request(url="https://weibo.cn/%s/info" % uid,
                              callback=self.parse_like_information)
                created_at = comment_node.xpath(
                    './/span[@class="ct"]/text()').extract_first()
                comment_item['created_at'] = time_fix(
                    created_at.split('\xa0')[0])
                yield comment_item
Example 11
    def parse(self, response):

        current_page = int(response.url.split("page=")[-1])
        print("[INFO] Crawling Tweets Page: "+str(current_page))
        print("[INFO Crawling URL: " + response.url)


        selector = Selector(response)
        tweetpage_item = TimelinePageRaw()
        tweetpage_item['user_id'] = re.findall("(\d+)\?page",response.url)[0]
        tweetpage_item['page_url'] = re.sub("https://.*?/fireprox",self.weibo_baseurl,response.url)
        tweetpage_item['page_raw'] = selector.extract() # get raw page content
        tweetpage_item['crawl_time_utc'] = dt.utcnow()
        yield tweetpage_item

        time_stop_flag = 0 # stop crawling if hit specified start time

        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        if len(tweet_nodes) < 1: # no information on this page
            update = ProfileUpdateItem()
            update["timelineCrawlJob_current_complete"] = True
            update["timelineCrawlJob_current_page"] = current_page
            update["timelineCrawlJob_run_history"] = tweetpage_item['crawl_time_utc']
            update["uid"] = tweetpage_item['user_id']
            yield update
            return

        for tweet_node in tweet_nodes:
            try:
                create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                print("[INFO] create_time_info raw: " + create_time_info)
                if "来自" in create_time_info:
                    created_at = time_fix(create_time_info.split('来自')[0].strip())
                    time_stop_flag = self.time_flag_compare(created_at) # time compare to trigger stop flag
                    
                else:
                    created_at = time_fix(create_time_info.strip())
                    time_stop_flag = self.time_flag_compare(created_at) # time compare to trigger stop flag

            except Exception as e:
                self.logger.error(e)

        #  keep looping until hit page with time range limit
        
        print("[DEBUG] timeflag:" + str(time_stop_flag))
        update = ProfileUpdateItem()
        if time_stop_flag == 0: 
            next_page = current_page + 1
            page_url = self.get_base_url() + '/{}?page={}'.format(tweetpage_item['user_id'],next_page)
            #page_url = response.url.replace('page='+str(current_page), 'page={}'.format(next_page))
            update["timelineCrawlJob_current_page"] = current_page
            update["timelineCrawlJob_current_complete"] = False
            update["uid"] = tweetpage_item['user_id']
            yield update
            yield Request(page_url, self.parse, dont_filter=True, meta=response.meta,priority=1)
        else:
            update["timelineCrawlJob_current_complete"] = True
            update["timelineCrawlJob_run_history"] = tweetpage_item['crawl_time_utc']
            update["timelineCrawlJob_current_page"] = current_page
            update["uid"] = tweetpage_item['user_id']
            yield update
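
Example 11 stops paging once time_flag_compare reports that a tweet is older than the configured start time. That helper is not shown; a minimal sketch, assuming created_at is the "YYYY-MM-DD HH:MM" string produced by time_fix and the cutoff is stored in self.start_time (both names are assumptions):

# Hypothetical sketch of the time_flag_compare helper used in Example 11.
from datetime import datetime

def time_flag_compare(self, created_at):
    """Return 1 when the tweet is older than the crawl window, else 0."""
    try:
        created = datetime.strptime(created_at, '%Y-%m-%d %H:%M')
    except ValueError:
        return 0  # unparseable timestamp: keep crawling
    return 1 if created < self.start_time else 0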
Example 12
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, queue requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
        """
        Parse the data on this page.
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
                create_time_info = tweet_node.xpath('.//span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(create_time_info.strip())

                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
                tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())

                tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]

                # Check whether there is a 'Read full text' (全文) link:
                all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath('string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content
                    yield tweet_item

                # Crawl the comments of this weibo
                comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment, meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
Example 13
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, queue requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        Parse the data on this page.
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                # Check whether there is a 'Read full text' (全文) link:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # Crawl the comments of this weibo
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
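
Example 13 delegates text extraction to extract_weibo_content (and Example 1 to extract_comment_content). Those helpers are not included here; roughly, they take the node's serialized HTML, flatten it to text and trim the trailing action links. A sketch under that assumption, not the project's actual implementation:

# Rough sketch of an extract_weibo_content-style helper; the real helper likely
# does more cleanup (image links, location suffix, reply prefix, etc.).
import re
from lxml import etree

def extract_weibo_content(node_html):
    text = etree.HTML(node_html).xpath('string(.)')
    text = text.replace('\u200b', '').strip()
    text = re.sub(r'\s*赞\[\d+\].*$', '', text, flags=re.S)  # cut the trailing action links
    return text

Example 14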
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, queue requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        Parse the data on this page.
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        # the total scrape counter is incremented for each tweet below
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                self.total_scrap_num += 1
                tweet_item['dataset_id'] = self.dataset_id
                tweet_item['blogger_id'] = self.blogger_id
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                # Crawl stop condition: limit how many days back and how many tweets in total
                time_now = datetime.datetime.now()
                created_time = datetime.datetime.strptime(
                    tweet_item['created_at'], "%Y-%m-%d %H:%M")
                # use logical 'and' so both limits apply ('&' binds tighter than '>')
                if (time_now - created_time).days > MAX_INTERVAL \
                        and self.total_scrap_num > MAX_SCRAP_NUM:
                    # TODO delete this tweet record
                    mongodb_operation.delete_twitter_rec(
                        weibo_url=tweet_item['weibo_url'],
                        dataset_id=self.dataset_id)
                    return

                # TODO if this weibo already exists, delete the previous weibo and comment records
                mongodb_operation.delete_previous_twitter_rec(
                    weibo_url=tweet_item['weibo_url'],
                    current_dataset_id=self.dataset_id)
                mongodb_operation.delete_previous_comment_under_twitter(
                    weibo_url=tweet_item['weibo_url'],
                    current_dataset_id=self.dataset_id)

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # Check whether there is a 'Read full text' (全文) link:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[0:]
                    yield tweet_item

                # Crawl the comments of this weibo
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
Example 15
    def parse(self, response):
        """
        Parse the search results page.
        :param response:
        :return:
        """
        if response.url.endswith('page=1'):
            # If this is page 1, queue requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        Parse the data on this page.
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # TODO add parsing of the asker's user_id here
                # Reordering this after the answerer's request does not work: asker_name_url does not contain 'info'
                # Extract the asker's user_id via xpath plus a regex, or go straight to the profile page; the URL
                # obtained here uses the nickname, and after the redirect the numeric id is returned
                asker_name_urltxt = tweet_node.xpath(
                    './/a[contains(text(),"@")]/text()')[0]
                asker_name_url = self.base_url + tweet_node.xpath(
                    './/a[contains(text(),"@")]/@href')[0]
                # print('asker profile url', asker_name_url)
                tweet_item['asker_name'] = asker_name_urltxt.split('@')[-1]
                # asker_name_url = self.base_url + asker_name_url
                # print('asker nickname', tweet_item['asker_name'])
                response_nickname = requests.get(asker_name_url)
                response_url = response_nickname.url
                if 'weibo.cn/u/' in response_url:
                    nickname_id = response_url.split('weibo.cn/u/')[-1]
                else:
                    nickname_id = response_url.split('uid=')[-1]
                # TODO yield a request for the asker here
                # https://blog.csdn.net/rgc_520_zyl/article/details/78946974
                # header = {,'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
                # the _id of the asker table should be the tweet's _id
                yield Request(
                    url="https://weibo.cn/{}/info".format(nickname_id),
                    callback=self.parse_information,
                    priority=3,
                    meta={'asker_from': tweet_item['weibo_url']})

                # Check whether there is a 'Read full text' (全文) link:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[1:]
                    yield tweet_item

                # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                #               callback=self.parse_information, priority=2)
                yield Request(url="https://weibo.cn/{}/info".format(
                    tweet_item['user_id']),
                              callback=self.parse_information,
                              priority=2)

                # TODO check whether there are comments; if so, yield a parse_comment request
                if tweet_item['comment_num'] > 0:
                    # Crawl the comments of this weibo
                    comment_url = self.base_url + '/comment/' + tweet_item[
                        'weibo_url'].split('/')[-1] + '?page=1'
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']},
                                  priority=5)

            except Exception as e:
                self.logger.error(e)
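
Example 15 resolves the asker's nickname URL with a blocking requests.get call inside the callback, which stalls Scrapy's event loop for every tweet. A non-blocking variant would yield Request(asker_name_url, callback=self.parse_asker_redirect, meta={'asker_from': tweet_item['weibo_url']}, priority=3) instead and read the final URL in a small callback; parse_asker_redirect is an assumed name, not part of the original code:

# Sketch of resolving the nickname redirect without blocking the reactor.
from scrapy import Request

def parse_asker_redirect(self, response):
    # Scrapy has already followed the redirect; the final URL carries the numeric uid.
    if 'weibo.cn/u/' in response.url:
        nickname_id = response.url.split('weibo.cn/u/')[-1]
    else:
        nickname_id = response.url.split('uid=')[-1]
    yield Request(url="https://weibo.cn/{}/info".format(nickname_id),
                  callback=self.parse_information, priority=3,
                  meta={'asker_from': response.meta.get('asker_from')})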
Example 16
    def parse(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, queue requests for all remaining pages at once
            # />&nbsp; is the HTML non-breaking-space entity in the pager footer
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        Parse the data on this page.
        """
        tree_node = etree.HTML(response.body)
        # select every div whose class is "c" and that also has an id attribute
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                # TODO repost and comment URLs
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                tweet_cmt_url = tweet_node.xpath(
                    './/a[contains(text(),"评论[")]/@href')[0]
                # id of the user who posted this weibo
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                # use _id as the weibo id
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                # strip the trailing source part, e.g. 'from Sina Weibo' / 'from iPhone'
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # Check whether there is a 'Read full text' (全文) link:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[1:]
                    yield tweet_item
                    # todo

                # crawl commenter profiles and comment content
                if tweet_item['comment_num'] > 0:
                    yield Request(url=tweet_cmt_url,
                                  callback=self.parse_cmt_info,
                                  meta={'weibo_id': tweet_item['_id']},
                                  priority=3)

                # # crawl the profile of the user who posted the weibo
                # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                #               callback=self.parse_information, priority=1)

                # TODO crawl the reposting users' profiles
                # TODO crawl the reposting users' ids, comments and like counts; with the weibo id they can be joined

            except Exception as e:
                self.logger.error(e)
Example 17
    def parse_cmt_info(self, response):
        if 'page' not in response.url:
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url + '&page=' + str(page_num)
                    yield Request(page_url,
                                  self.parse_cmt_info,
                                  meta=response.meta,
                                  priority=2)  # TODO is meta=response.meta needed here?
        """
        Parse the comment page.
        """
        cmt_tree_node = etree.HTML(response.body)
        cmt_nodes = cmt_tree_node.xpath('//div[@class="c" and @id]')
        weibo_id = response.meta['weibo_id']
        for cmt_node in cmt_nodes:
            try:
                cmt = CommentItem()
                cmt['weibo_id'] = weibo_id

                cmt['_id'] = cmt_node.xpath('./@id')[0]
                user_url = cmt_node.xpath('.//a/@href')[0]
                cmt['comment_user_id'] = re.search(r'\d+', user_url).group()

                create_time_info = cmt_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                # strip the trailing source part, e.g. 'from Sina Weibo' / 'from iPhone'
                if "来自" in create_time_info:
                    cmt['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    cmt['created_at'] = time_fix(create_time_info.strip())
                isreply = cmt_node.xpath('.//span[@class="ctt"]/text()')
                # cmt['content'] = contents.xpath('string(.)').replace('\u200b', '').strip()
                # strip the leading ':' from text()[0]
                content_node = cmt_node.xpath('.//span[@class="ctt"]')[0]
                all_content = content_node.xpath('string(.)').replace(
                    '\u200b', '').strip()
                # todo
                # if isreply[0].__contains__('回复'):
                #     cmt_node['reply_id'] = content_node.xpath('.//a/@href')[0]
                #     print('+++++++++++++++++++')
                #     print(cmt_node['reply_id'])
                cmt['content'] = all_content

                like_num = cmt_node.xpath(
                    './/span[@class="cc"]/a[contains(text(),"赞[")]/text()')[0]
                cmt['like_num'] = int(re.search(r'\d+', like_num).group())
                cmt['crawl_time'] = int(time.time())

                # the commenting user's profile
                yield Request(url="https://weibo.cn/{}/info".format(
                    cmt['comment_user_id']),
                              callback=self.parse_info,
                              priority=3,
                              meta={'cmt_item': cmt})

            except Exception as e:
                print('error happened at comment parsing')
                self.logger.error(e)
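
Note that Example 17 never yields the CommentItem directly: it rides along in meta={'cmt_item': cmt} to parse_info, which is expected to enrich it with the commenter's profile and yield the finished item. That callback is not shown; one way it could look, with the profile selector and the nickname field both being assumptions:

# Assumed shape of the parse_info callback that finishes the comment item from Example 17.
from lxml import etree

def parse_info(self, response):
    cmt = response.meta['cmt_item']
    tree_node = etree.HTML(response.body)
    # weibo.cn profile pages list attributes as plain "昵称:..." text lines
    nicknames = [t for t in tree_node.xpath('//div[@class="c"]//text()')
                 if t.startswith('昵称:')]
    if nicknames:
        # assumed field; declare it on CommentItem if the pipeline should keep it
        cmt['comment_user_nickname'] = nicknames[0].replace('昵称:', '').strip()
    yield cmt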
Example 18
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, queue requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                if (all_page > self.MAX_WEIBO_PAGES):
                    all_page = self.MAX_WEIBO_PAGES
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        Parse the data on this page.
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())
                # earliest allowed date: ignore tweets older than MIN_WEIBO_DATE
                if tweet_item['created_at'] < self.MIN_WEIBO_DATE:
                    pass
                else:
                    like_num = tweet_node.xpath(
                        './/a[contains(text(),"赞[")]/text()')[-1]
                    tweet_item['like_num'] = int(
                        re.search('\d+', like_num).group())

                    repost_num = tweet_node.xpath(
                        './/a[contains(text(),"转发[")]/text()')[-1]
                    tweet_item['repost_num'] = int(
                        re.search('\d+', repost_num).group())

                    comment_num = tweet_node.xpath(
                        './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                    )[-1]
                    tweet_item['comment_num'] = int(
                        re.search('\d+', comment_num).group())

                    # Check whether there is a 'Read full text' (全文) link:
                    all_content_link = tweet_node.xpath(
                        './/a[text()="全文" and contains(@href,"ckAll=1")]')
                    if all_content_link:
                        all_content_url = self.base_url + all_content_link[
                            0].xpath('./@href')[0]
                        yield Request(all_content_url,
                                      callback=self.parse_all_content,
                                      meta={'item': tweet_item},
                                      priority=1)

                    else:
                        all_content_text = tweet_node.xpath('string(.)')
                        if '转发理由:' in all_content_text:
                            all_content_text = all_content_text.split(
                                '转发理由:')[1]
                        all_content_text = all_content_text.split(
                            '\xa0', maxsplit=1)[0]
                        tweet_item['content'] = all_content_text.strip()
                        try:
                            s = SnowNLP(tweet_item['content'])
                            tweet_item['sentiments'] = str(s.sentiments *
                                                           10)[0:8]
                        except:
                            tweet_item['sentiments'] = '5.0'

                        try:
                            sql = "INSERT INTO `sbhdb`.`weibo_info`( `weibo_url`, `user_id`, `content`, `created_at`, `repost_num`, `comment_num`, `like_num`, `crawl_time`, `sentiments`) VALUES ('%s', '%s', '%s', '%s', %s,%s, %s, %s,%s)" % (
                                tweet_item['weibo_url'], tweet_item['user_id'],
                                tweet_item['content'],
                                tweet_item['created_at'],
                                tweet_item['repost_num'],
                                tweet_item['comment_num'],
                                tweet_item['like_num'],
                                tweet_item['crawl_time'],
                                tweet_item['sentiments'])
                            self.cursor.execute(sql)
                            self.db.commit()
                        except:
                            # duplicate row already in the table; skip this tweet
                            continue
                    yield tweet_item

                    # Crawl the comments of this weibo
                    comment_url = self.base_url + '/comment/' + tweet_item[
                        'weibo_url'].split('/')[-1] + '?page=1'
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
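
The INSERT in the example above is built with '%'-string formatting and guarded by a broad except, so any quote inside the tweet content can break the statement and every database error is treated as a duplicate. Below is a minimal sketch, not part of the original spider, of the same write done as a parameterized query; it assumes the connection is a pymysql connection and keeps the original sbhdb.weibo_info columns.

import pymysql


def save_tweet(db, tweet_item):
    # parameterized INSERT: the driver handles quoting/escaping of every value
    sql = ("INSERT INTO `sbhdb`.`weibo_info` "
           "(`weibo_url`, `user_id`, `content`, `created_at`, `repost_num`, "
           "`comment_num`, `like_num`, `crawl_time`, `sentiments`) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    params = (tweet_item['weibo_url'], tweet_item['user_id'],
              tweet_item['content'], tweet_item['created_at'],
              tweet_item['repost_num'], tweet_item['comment_num'],
              tweet_item['like_num'], tweet_item['crawl_time'],
              tweet_item['sentiments'])
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, params)
        db.commit()
    except pymysql.err.IntegrityError:
        # duplicate primary key: the tweet was already stored on an earlier crawl
        db.rollback()
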
Esempio n. 19
0
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # on page 1, read the total page count once
            self.current_page = 1
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                self.all_page_num = all_page
        print("[INFO] Crawling Tweets Page: " + str(self.current_page))
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time_utc'] = dt.utcnow(
                )  # insert datetime timestamp utc
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                # if tweet_item['user_id']:
                #     print("[DEBUG] user_id:" + str(tweet_item['user_id']))
                # else:
                #     print("[DEBUG] user_id ERROR")

                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = ""

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())
                #print("[DEBUG] like_num:" + str(tweet_item['like_num']))
                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                #print("[DEBUG] repost_num:" + str(tweet_item['repost_num']))
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                #print("[DEBUG] comment_num:" + str(tweet_item['comment_num']))
                # Grab all images: 1) check whether a multi-image (组图) link exists;
                # 2) if not, fall back to the single inline image handled below
                multi_img_link = tweet_node.xpath(
                    './/a[contains(text(),"组图")]/@href')
                if multi_img_link:
                    #print("[DEBUG] multi_img_link:" + multi_img_link[-1])
                    tweet_item['multi_imgs'] = True
                    yield Request(url=multi_img_link[-1],
                                  callback=self.parse_multi_images,
                                  meta={'_id': tweet_item['_id']},
                                  priority=1)
                else:
                    tweet_item['multi_imgs'] = False

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]
                else:
                    tweet_item['image_url'] = "NA"

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]
                else:
                    tweet_item['video_url'] = "NA"

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                else:
                    tweet_item['location_map_info'] = "NA"

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['retweet'] = True
                    tweet_item['origin_weibo'] = repost_node[0]
                    # crawl original weibo
                    # origin_weibo_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                    # yield Request(url=repost_url, callback=self.parse_repost, meta={'weibo_url': tweet_item['weibo_url']},priority = 2)

                else:
                    tweet_item['retweet'] = False
                    tweet_item['origin_weibo'] = "NA"
                # check whether the tweet has a "read full text" (全文) link:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # crawl this weibo's comments
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

                # Crawl tweet repost
                repost_url = self.base_url + '/repost/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=repost_url,
                              callback=self.parse_repost,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

            except Exception as e:
                self.logger.error(e)

        # keep paging until a page falls outside the time range or we run out of pages
        self.current_page = self.current_page + 1
        if self.time_stop_flag == 0 and self.current_page < (
                self.all_page_num + 1) and self.current_page >= 2:
            next_page = self.current_page
            current_page_str = "page=" + str(next_page - 1)
            page_url = response.url.replace(current_page_str,
                                            'page={}'.format(next_page))
            yield Request(page_url,
                          self.parse_tweet,
                          dont_filter=True,
                          meta=response.meta,
                          priority=1)
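
Esempio n. 19 and n. 20 both call self.time_flag_compare(created_at) to decide when to stop paging, but the helper itself is not included in the snippets. The sketch below is one plausible, hypothetical implementation, assuming time_fix() has normalized created_at to the "YYYY-MM-DD HH:MM" form and that the spider holds a self.start_time string in the same format; it is written as a spider method to be pasted into the class.

from datetime import datetime


def time_flag_compare(self, created_at):
    """Hypothetical sketch: return 1 once a tweet falls before the crawl window."""
    fmt = "%Y-%m-%d %H:%M"
    tweet_time = datetime.strptime(created_at, fmt)
    start_time = datetime.strptime(self.start_time, fmt)
    # 1 tells parse_tweet to stop requesting further pages
    return 1 if tweet_time < start_time else 0
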
Esempio n. 20
0
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # on page 1, read the total page count once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                self.all_page_num = all_page

        current_page = int(response.url.split("page=")[-1])
        print("[INFO] Crawling Tweets Page: " + str(current_page))
        print("[INFO Crawling URL" + response.url)
        """
        解析本页的数据
        """
        selector = Selector(response)
        tweetpage_item = TweetPageItem()
        tweetpage_item['user_id'] = response.meta["user_id"]
        tweetpage_item['page_url'] = response.url.replace(
            self.base_url, self.weibo_baseurl)
        tweetpage_item['page_raw'] = selector.extract()  # get raw page content
        tweetpage_item['crawl_time_utc'] = dt.utcnow()
        yield tweetpage_item

        time_stop_flag = 0  # stop crawling if hit specified start time

        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        if len(tweet_nodes) < 1:
            return

        for tweet_node in tweet_nodes:
            try:
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                print("[INFO] create_time_info raw: " + create_time_info)
                if "来自" in create_time_info:
                    created_at = time_fix(
                        create_time_info.split('来自')[0].strip())
                    time_stop_flag = self.time_flag_compare(
                        created_at)  # time compare to trigger stop flag

                else:
                    created_at = time_fix(create_time_info.strip())
                    time_stop_flag = self.time_flag_compare(
                        created_at)  # time compare to trigger stop flag

                # check whether the tweet has a "read full text" (全文) link:
                # all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
                # if all_content_link:
                #     all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                #     yield Request(all_content_url, callback=self.parse_all_content, meta={'user_id': response.meta["user_id"]},
                #                   priority=1)

                # crawl this weibo's comments
                # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                # yield Request(url=comment_url, callback=self.parse_comment, meta={'weibo_url': tweet_item['weibo_url']},priority=2)

                # Crawl tweet repost
                # repost_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                # yield Request(url=repost_url, callback=self.parse_repost, meta={'weibo_url': tweet_item['weibo_url']},priority=2)

            except Exception as e:
                self.logger.error(e)

        # keep paging until a page falls outside the time range

        print("[DEBUG] timeflag:" + str(time_stop_flag))
        if time_stop_flag == 0:
            next_page = current_page + 1
            page_url = response.url.replace('page=' + str(current_page),
                                            'page={}'.format(next_page))
            yield Request(page_url,
                          self.parse_tweet,
                          dont_filter=True,
                          meta=response.meta,
                          priority=1)
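
For context, a parse_tweet callback like the ones above is normally reached from the spider's start_requests, which builds the first "?page=1" timeline URL and passes the user id through meta (Esempio n. 20 reads it back as response.meta["user_id"]). The sketch below shows that wiring only; the /{uid}/profile?page=1 path and the user ids are assumptions for illustration, not something shown in these snippets.

from scrapy import Spider, Request


class TweetSpiderSketch(Spider):
    # minimal sketch of how the parse_tweet callbacks above are usually driven
    name = 'tweet_sketch'
    base_url = 'https://weibo.cn'
    start_uids = ['1234567890']  # hypothetical user ids to crawl

    def start_requests(self):
        for uid in self.start_uids:
            url = '{}/{}/profile?page=1'.format(self.base_url, uid)
            yield Request(url, callback=self.parse_tweet, meta={'user_id': uid})

    def parse_tweet(self, response):
        pass  # one of the parse_tweet implementations above goes here
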