    def parse_tweet(self, response):
        page_url = response.url
        tweet_item = TweetsItem()
        tree_node = etree.HTML(response.body)
        tweet_content_node = tree_node.xpath('.//span[@class="ctt"]')[0]
        all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
        tweet_item['content'] = all_content
        tweet_item['crawl_time'] = int(time.time())

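        # the tweet page URL looks like https://weibo.cn/<user_id>/<tweet_id>; capture both parts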
        user_tweet_id = re.search(r'https://weibo.cn/(\d+)/(.*)', page_url)
        tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
            user_tweet_id.group(1), user_tweet_id.group(2))
        tweet_item['user_id'] = user_tweet_id.group(1)
        tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                           user_tweet_id.group(1))
        create_time_info = tree_node.xpath(
            './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
        tweet_item['created_at'] = time_fix(
            create_time_info.split('来自')[0].strip())
        like_num = tree_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
        tweet_item['like_num'] = int(re.search('\d+', like_num).group())
        repost_num = tree_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
        tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())
        comment_num = tree_node.xpath(
            './/span[@class="pms" and contains(text(),"评论[")]/text()')[0]
        tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())
        yield tweet_item
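        # queue the first page of comments for this tweet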
        comment_url = page_url + '?page=1'
        yield Request(url=comment_url,
                      callback=self.parse_comment,
                      meta={'weibo_url': page_url})
    def parse_tweets(self, response):
        """ 抓取微博数据 """
        selector = Selector(response)
        ID = re.findall('(\d+)/profile', response.url)[0]
        divs = selector.xpath('body/div[@class="c" and @id]')
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                id = div.xpath('@id').extract_first()  # tweet id

                real_id = id.split('_')[1]

                content = div.xpath(
                    'div/span[@class="ctt"]//text()').extract()  # tweet content
                comment = re.findall('评论\[(\d+)\]', div.extract())  # comment count
                tweetsItems["Comment"] = 0
                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                if content:
                    tweetsItems["Content"] = " ".join(content).strip(
                        '[位置]')  # 去掉最后的"[位置]"
                if comment:
                    tweetsItems["Comment"] = int(comment[0])
                yield tweetsItems

                #enter a tweet
                #https://weibo.cn/comment/ + real_id
                if comment and int(comment[0]) > 0:
                    yield Request(url="https://weibo.cn/comment/%s" % real_id,
                                  callback=self.parse_comment)

            except Exception as e:
                self.logger.info(e)
                pass
Example #3
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # if this is page 1, schedule all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
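                # the "转发[n]" link's href looks like /repost/<tweet_id>?uid=<user_id>; recover both ids from it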
                tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
                create_time_info = tweet_node.xpath('.//span[@class="ct" and contains(text(),"来自")]/text()')[0]
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())

                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
                tweet_item['like_num'] = int(re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
                tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[0]
                tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())

                tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]

                # check whether there is a "全文" (read full text) link:
                all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
                    tweet_item['content'] = all_content
                    yield tweet_item

                # crawl the comments of this tweet
                comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment, meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
Example #4
    def parse_tweets(self, response):
        """ 抓取微博数据 """
        selector = Selector(response)
        ID = re.findall('(\d+)/profile', response.url)[0]
        divs = selector.xpath('body/div[@class="c" and @id]')
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                id = div.xpath('@id').extract_first()  # tweet id
                content = div.xpath(
                    'div/span[@class="ctt"]//text()').extract()  # tweet content
                cooridinates = div.xpath('div/a/@href').extract()  # location coordinates
                like = re.findall('赞\[(\d+)\]', div.extract())  # like count
                transfer = re.findall('转发\[(\d+)\]', div.extract())  # repost count
                comment = re.findall('评论\[(\d+)\]', div.extract())  # comment count
                others = div.xpath('div/span[@class="ct"]/text()').extract(
                )  # publish time and tool used (phone or platform)

                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                if content:
                    tweetsItems["Content"] = " ".join(content).strip(
                        '[位置]')  # 去掉最后的"[位置]"
                if cooridinates:
                    cooridinates = re.findall('center=([\d.,]+)',
                                              cooridinates[0])
                    if cooridinates:
                        tweetsItems["Co_oridinates"] = cooridinates[0]
                if like:
                    tweetsItems["Like"] = int(like[0])
                if transfer:
                    tweetsItems["Transfer"] = int(transfer[0])
                if comment:
                    tweetsItems["Comment"] = int(comment[0])
                if others:
                    others = others[0].split('来自')
                    tweetsItems["PubTime"] = others[0].replace(u"\xa0", "")
                    if len(others) == 2:
                        tweetsItems["Tools"] = others[1].replace(u"\xa0", "")
                yield tweetsItems
            except Exception as e:
                self.logger.info(e)
                pass

        url_next = selector.xpath(
            'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
        ).extract()
        if url_next:
            yield Request(url=self.host + url_next[0],
                          callback=self.parse_tweets,
                          dont_filter=True)
Example #5
    def parseTweets(self, response):
        if len(response.body) > 50:
            print "###########################"
            print "Fetch Tweets Success"
            print "###########################"

            tweets = json.loads(response.body)
            ID = response.meta["ID"]
            page = ''
            containerid = ''
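            # "cards" holds the timeline entries; "cardlistInfo" carries the paging cursor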
            if tweets.get("cards", ""):
                cards = tweets["cards"]
                if tweets["cardlistInfo"].get("page", ""):
                    page = tweets["cardlistInfo"]["page"]
                    page = str(page)
                else:
                    return
                if tweets["cardlistInfo"].get("containerid", ""):
                    containerid = tweets["cardlistInfo"]["containerid"]
                for card in cards:
                    mblog = card.get('mblog', '')
                    if mblog:
                        tweetsItems = TweetsItem()
                        tweetsItems["_id"] = card["itemid"]
                        tweetsItems["ID"] = ID
                        tweetsItems["Content"] = json.dumps(mblog)
                        tweetsItems["PubTime"] = mblog["created_at"]
                        tweetsItems["Like"] = mblog["attitudes_count"]
                        tweetsItems["Comment"] = mblog["comments_count"]
                        tweetsItems["Transfer"] = mblog["reposts_count"]
                        yield tweetsItems
                print "###########################"
                print "Tweetspage: " + page
                print "###########################"
                url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s&page=%s" % (
                    ID, containerid, page)
                yield Request(url=url_tweets,
                              meta={"ID": ID},
                              callback=self.parseTweets,
                              dont_filter=True)
            else:
                return
        else:
            print "###########################"
            print "Fetch Tweets Finish"
            print "###########################"
            return
Example #6
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # if this is page 1, schedule all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]

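                # the "显示地图" (show map) link carries the coordinates in its "xy=" query parameter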
                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                # check whether there is a "全文" (read full text) link:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # crawl the comments of this tweet
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
Example #7
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # if page 1, get all page number
            self.current_page = 1
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                self.all_page_num = all_page
        print("[INFO] Crawling Tweets Page: " + str(self.current_page))
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time_utc'] = dt.utcnow(
                )  # insert datetime timestamp utc
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                # if tweet_item['user_id']:
                #     print("[DEBUG] user_id:" + str(tweet_item['user_id']))
                # else:
                #     print("[DEBUG] user_id ERROR")

                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = ""

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())
                #print("[DEBUG] like_num:" + str(tweet_item['like_num']))
                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                #print("[DEBUG] repost_num:" + str(tweet_item['repost_num']))
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                #print("[DEBUG] comment_num:" + str(tweet_item['comment_num']))
                # Grab all images: 1) check whether a multi-image ("组图") link exists 2) if not, fall back to the single image link below
                multi_img_link = tweet_node.xpath(
                    './/a[contains(text(),"组图")]/@href')
                if multi_img_link:
                    #print("[DEBUG] multi_img_link:" + multi_img_link[-1])
                    tweet_item['multi_imgs'] = True
                    yield Request(url=multi_img_link[-1],
                                  callback=self.parse_multi_images,
                                  meta={'_id': tweet_item['_id']},
                                  priority=1)
                else:
                    tweet_item['multi_imgs'] = False

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]
                else:
                    tweet_item['image_url'] = "NA"

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]
                else:
                    tweet_item['video_url'] = "NA"

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                else:
                    tweet_item['location_map_info'] = "NA"

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['retweet'] = True
                    tweet_item['origin_weibo'] = repost_node[0]
                    # crawl original weibo
                    # origin_weibo_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                    # yield Request(url=repost_url, callback=self.parse_repost, meta={'weibo_url': tweet_item['weibo_url']},priority = 2)

                else:
                    tweet_item['retweet'] = False
                    tweet_item['origin_weibo'] = "NA"
                # check whether there is a "全文" (read full text) link:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # crawl the comments of this tweet
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

                # Crawl tweet repost
                repost_url = self.base_url + '/repost/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=repost_url,
                              callback=self.parse_repost,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

            except Exception as e:
                self.logger.error(e)

        #  keep looping until hit page with time range limit
        self.current_page = self.current_page + 1
        if self.time_stop_flag == 0 and self.current_page < (
                self.all_page_num + 1) and self.current_page >= 2:
            next_page = self.current_page
            current_page_str = "page=" + str(next_page - 1)
            page_url = response.url.replace(current_page_str,
                                            'page={}'.format(next_page))
            yield Request(page_url,
                          self.parse_tweet,
                          dont_filter=True,
                          meta=response.meta,
                          priority=1)
Example #8
    def parse_tweets(self, response):
        """
        functions:
           1. catch each tweet
           2. request next page if existed
        """
        ID = re.findall('(\d+)/profile', response.url)[0]
        divs = response.xpath('body/div[@class="c" and @id]')
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                # _id and ID
                id = div.xpath('@id').extract_first()
                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                # content
                if div.xpath('div/span[@class="ctt"]//text()').extract():
                    content = div.xpath(
                        'div/span[@class="ctt"]//text()').extract()
                    content = " ".join(content).strip('[位置]').strip()
                    # parse content
                    tweetsItems["Content"] = content.replace(
                        u"\u200b", "").replace(u"\xa0 全文", "")
                # coordinates
                if div.xpath('div/a/@href').extract():
                    cooridinates = div.xpath('div/a/@href').extract()
                    cooridinates = re.findall('center=([\d.,]+)',
                                              cooridinates[0])
                    if cooridinates:
                        tweetsItems["Co_oridinates"] = cooridinates[0]
                # like
                if re.findall('赞\[(\d+)\]', div.extract()):
                    like = re.findall('赞\[(\d+)\]', div.extract())
                    tweetsItems["Like"] = int(like[0])
                # transfer
                if re.findall('转发\[(\d+)\]', div.extract()):
                    transfer = re.findall('转发\[(\d+)\]', div.extract())
                    tweetsItems["Transfer"] = int(transfer[0])
                # comment:
                if re.findall('评论\[(\d+)\]', div.extract()):
                    comment = re.findall('评论\[(\d+)\]', div.extract())
                    tweetsItems["Comment"] = int(comment[0])
                # date and equipments/platform
                if div.xpath('div/span[@class="ct"]/text()').extract():
                    others = div.xpath(
                        'div/span[@class="ct"]/text()').extract()
                    others = others[0].split('来自')
                    tweetsItems["PubTime"] = others[0].replace(u"\xa0", "")
                    if len(others) == 2:
                        tweetsItems["Tools"] = others[1].replace(u"\xa0", "")

                print(tweetsItems)
                yield tweetsItems
            except Exception as e:
                self.logger.info(e)
                pass
        # request next page
        next_url = "https://weibo.cn" + response.xpath(
            "//div[@class='pa']/form/div/a[1]/@href").extract()
        if next_url:
            yield Request(url=next_url[0],
                          callback=self.parse_tweets,
                          dont_filter=True)
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # if this is page 1, schedule all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        # the total scrape counter is incremented once per tweet
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                self.total_scrap_num += 1
                tweet_item['dataset_id'] = self.dataset_id
                tweet_item['blogger_id'] = self.blogger_id
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                # crawl stop condition: at most MAX_INTERVAL days back and at most MAX_SCRAP_NUM tweets
                time_now = datetime.datetime.now()
                created_time = datetime.datetime.strptime(
                    tweet_item['created_at'], "%Y-%m-%d %H:%M")
                if (time_now - created_time).days > MAX_INTERVAL and \
                        self.total_scrap_num > MAX_SCRAP_NUM:
                    # TODO delete this tweet record
                    mongodb_operation.delete_twitter_rec(
                        weibo_url=tweet_item['weibo_url'],
                        dataset_id=self.dataset_id)
                    return

                # TODO if this tweet already exists, delete the previous tweet record and its comment records
                mongodb_operation.delete_previous_twitter_rec(
                    weibo_url=tweet_item['weibo_url'],
                    current_dataset_id=self.dataset_id)
                mongodb_operation.delete_previous_comment_under_twitter(
                    weibo_url=tweet_item['weibo_url'],
                    current_dataset_id=self.dataset_id)

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # check whether there is a "全文" (read full text) link:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[0:]
                    yield tweet_item

                # crawl the comments of this tweet
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
Example #10
    def parse(self, response):
        """
        解析搜索页面
        :param response:
        :return:
        """
        if response.url.endswith('page=1'):
            # if this is page 1, schedule all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # TODO add parsing of the asker's user_id here
                # Tried moving this after the answerer's request, but that does not work: asker_name_url here does not contain "info"
                # Extract the asker's user_id with xpath plus a regex; the link uses the nickname in the URL, and after following the redirect the numeric id is returned
                asker_name_urltxt = tweet_node.xpath(
                    './/a[contains(text(),"@")]/text()')[0]
                asker_name_url = self.base_url + tweet_node.xpath(
                    './/a[contains(text(),"@")]/@href')[0]
                # print('asker profile url:', asker_name_url)
                tweet_item['asker_name'] = asker_name_urltxt.split('@')[-1]
                # asker_name_url = self.base_url + asker_name_url
                # print('asker nickname:', tweet_item['asker_name'])
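                # follow the profile link synchronously; the redirect target exposes the numeric uid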
                response_nickname = requests.get(asker_name_url)
                response_url = response_nickname.url
                if 'weibo.cn/u/' in response_url:
                    nickname_id = response_url.split('weibo.cn/u/')[-1]
                else:
                    nickname_id = response_url.split('uid=')[-1]
                # TODO yield a request for the asker here
                # https://blog.csdn.net/rgc_520_zyl/article/details/78946974
                # header = {,'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
                # the _id of the asker table should be the tweet's _id
                yield Request(
                    url="https://weibo.cn/{}/info".format(nickname_id),
                    callback=self.parse_information,
                    priority=3,
                    meta={'asker_from': tweet_item['weibo_url']})

                # check whether there is a "全文" (read full text) link:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[1:]
                    yield tweet_item

                # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                #               callback=self.parse_information, priority=2)
                yield Request(url="https://weibo.cn/{}/info".format(
                    tweet_item['user_id']),
                              callback=self.parse_information,
                              priority=2)

                # TODO check whether there are comments; if so, yield a parse_comment request
                if tweet_item['comment_num'] > 0:
                    # crawl the comments of this tweet
                    comment_url = self.base_url + '/comment/' + tweet_item[
                        'weibo_url'].split('/')[-1] + '?page=1'
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']},
                                  priority=5)

            except Exception as e:
                self.logger.error(e)
Example #11
    def parse1(self, response):
        '''Crawl personal information (part 2)'''
        # Some fields may be missing, but MySQL requires every column when saving, so default them all to empty strings to avoid insert errors; MongoDB does not have this problem because the whole dict is inserted directly and each field does not need to be assigned
        informationItems = InformationItem()
        informationItems['NickName'] = ''
        informationItems['Gender'] = ''
        informationItems['City'] = ''
        informationItems['URL'] = ''
        informationItems['Num_Fans'] = ''
        informationItems['Num_Follows'] = ''
        informationItems['Num_Tweets'] = ''
        informationItems['Province'] = ''
        informationItems['Signature'] = ''
        #		informationItems = response.meta["item"]
        selector = Selector(response)
        ID = re.findall('weibo\.cn/(\d+)', response.url)[0]
        text1 = ";".join(
            selector.xpath('body/div[@class="c"]/text()').extract())
        print('text1 data:')
        print(text1)
        nickname = re.findall('昵称[:|:](.*?);', text1)  # nickname
        gender = re.findall('性别[:|:](.*?);', text1)  # gender
        place = re.findall('地区[:|:](.*?);', text1)  # region (province and city)
        signature = re.findall('简介[:|:](.*?);', text1)  # bio / signature
        birthday = re.findall('生日[:|:](.*?);', text1)  # birthday
        sexorientation = re.findall('性取向[:|:](.*?);', text1)  # sexual orientation
        marriage = re.findall('感情状况[:|:](.*?);', text1)  # relationship status
        url = re.findall('互联网[:|:](.*?);', text1)  # homepage link
        print('nickname and gender data:')
        print(nickname)
        print(gender)
        if nickname:
            informationItems["NickName"] = nickname[0]
        if gender:
            informationItems['Gender'] = gender[0]
        if place:
            place = place[0].split(' ')
            informationItems['Province'] = place[0]
            if len(place) > 1:
                informationItems['City'] = place[1]
        if signature:
            informationItems['Signature'] = signature[0]
        if birthday:
            try:
                birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
                informationItems["Birthday"] = birthday - datetime.timedelta(
                    hours=8)
            except Exception:
                pass

        if sexorientation:
            if gender and sexorientation[0] == gender[0]:
                informationItems["Sex_Orientation"] = "gay"
            else:
                informationItems["Sex_Orientation"] = "Heterosexual"
        if marriage:
            informationItems['Marriage'] = marriage[0]

        if url:
            informationItems["URL"] = url[0]

        urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID
        r = requests.get(urlothers, cookies=response.request.cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = ';'.join(selector.xpath('//div[@class="tip2"]/a/text()'))
            print('texts data:')
            print(texts)
            if texts:
                num_tweets = re.findall('微博\[(\d+)\]', texts)  # tweet count
                num_follows = re.findall('关注\[(\d+)\]', texts)  # following count
                num_fans = re.findall('粉丝\[(\d+)\]', texts)  # follower count
                if num_tweets:
                    informationItems['Num_Tweets'] = int(num_tweets[0])
                if num_follows:
                    informationItems['Num_Follows'] = int(num_follows[0])
                if num_fans:
                    informationItems['Num_Fans'] = int(num_fans[0])
        print('informationItems data:')
        print(informationItems)
        yield informationItems

        contents = []
        tweets = TweetsItem()
        tweets['_id'] = ID
        tweets['Content'] = contents

        yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID,
                      meta={
                          'item': tweets,
                          'contents': contents
                      },
                      callback=self.parse_tweets)
Example #12
    def parse(self, response):
        if response.url.endswith('page=1'):
            # if this is page 1, schedule all remaining pages at once
            # "/>&nbsp;" is the HTML non-breaking-space placeholder
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        # select all div elements with class="c" that have an id attribute
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                # todo repost and comment URLs
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                tweet_cmt_url = tweet_node.xpath(
                    './/a[contains(text(),"评论[")]/@href')[0]
                # id of the user who posted the tweet
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                # _id serves as the tweet id
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                # strip the trailing source part, e.g. "来自新浪微博" / "来自iphone"
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # check whether there is a "全文" (read full text) link:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[1:]
                    yield tweet_item
                    # todo

                # crawl commenter info and comment content
                if tweet_item['comment_num'] > 0:
                    yield Request(url=tweet_cmt_url,
                                  callback=self.parse_cmt_info,
                                  meta={'weibo_id': tweet_item['_id']},
                                  priority=3)

                # # crawl the profile of the user who posted the tweet
                # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                #               callback=self.parse_information, priority=1)

                # todo crawl reposting users' info
                # todo crawl reposting users' ids, comments and like counts; together with the tweet id they can be matched

            except Exception as e:
                self.logger.error(e)
Example #13
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # if this is page 1, schedule all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                if (all_page > self.MAX_WEIBO_PAGES):
                    all_page = self.MAX_WEIBO_PAGES
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())
                # minimum date cutoff: skip tweets older than MIN_WEIBO_DATE
                if (tweet_item['created_at'] < self.MIN_WEIBO_DATE):
                    pass
                else:
                    like_num = tweet_node.xpath(
                        './/a[contains(text(),"赞[")]/text()')[-1]
                    tweet_item['like_num'] = int(
                        re.search('\d+', like_num).group())

                    repost_num = tweet_node.xpath(
                        './/a[contains(text(),"转发[")]/text()')[-1]
                    tweet_item['repost_num'] = int(
                        re.search('\d+', repost_num).group())

                    comment_num = tweet_node.xpath(
                        './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                    )[-1]
                    tweet_item['comment_num'] = int(
                        re.search('\d+', comment_num).group())

                    # check whether there is a "全文" (read full text) link:
                    all_content_link = tweet_node.xpath(
                        './/a[text()="全文" and contains(@href,"ckAll=1")]')
                    if all_content_link:
                        all_content_url = self.base_url + all_content_link[
                            0].xpath('./@href')[0]
                        yield Request(all_content_url,
                                      callback=self.parse_all_content,
                                      meta={'item': tweet_item},
                                      priority=1)

                    else:
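                        # plain-text fallback: keep the text after "转发理由:" (repost reason) and drop everything after the first non-breaking space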
                        all_content_text = tweet_node.xpath('string(.)')
                        if '转发理由:' in all_content_text:
                            all_content_text = all_content_text.split(
                                '转发理由:')[1]
                        all_content_text = all_content_text.split(
                            '\xa0', maxsplit=1)[0]
                        tweet_item['content'] = all_content_text.strip()
                        try:
                            s = SnowNLP(tweet_item['content'])
                            tweet_item['sentiments'] = str(s.sentiments *
                                                           10)[0:8]
                        except:
                            tweet_item['sentiments'] = '5.0'

                        try:
                            # use a parameterized query so quotes in the content do not break the SQL
                            sql = ("INSERT INTO `sbhdb`.`weibo_info`(`weibo_url`, `user_id`, `content`, "
                                   "`created_at`, `repost_num`, `comment_num`, `like_num`, `crawl_time`, "
                                   "`sentiments`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                            self.cursor.execute(sql, (
                                tweet_item['weibo_url'], tweet_item['user_id'],
                                tweet_item['content'],
                                tweet_item['created_at'],
                                tweet_item['repost_num'],
                                tweet_item['comment_num'],
                                tweet_item['like_num'],
                                tweet_item['crawl_time'],
                                tweet_item['sentiments']))
                            self.db.commit()
                        except Exception:
                            # duplicate row; skip this tweet
                            continue
                    yield tweet_item

                    # crawl the comments of this tweet
                    comment_url = self.base_url + '/comment/' + tweet_item[
                        'weibo_url'].split('/')[-1] + '?page=1'
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
Example #14
    def parseTweets(self, response):
        if len(response.body) > 50:
            print("###########################")
            print("Fetch Tweets Success")
            print("###########################")
            ori_ID = response.meta['ori_id']
            tweets = json.loads(response.body)
            ID = response.meta["ID"]
            Owner = response.meta["owner"]
            page = ''
            containerid = ''
            if tweets.get("cards", ""):
                cards = tweets["cards"]
                if tweets["cardlistInfo"].get("page", ""):
                    page = tweets["cardlistInfo"]["page"]
                    page = str(page)
                else:
                    return
                # if tweets["cardlistInfo"].get("containerid", ""):
                #     containerid = tweets["cardlistInfo"]["containerid"]
                for card in cards:
                    mblog = card.get('mblog', '')
                    if mblog:
                        tweetsItems = TweetsItem()
                        tweetsItems["_id"] = mblog["id"]
                        tweetsItems["ID"] = ID
                        tweetsItems["Owner"] = Owner
                        tweetsItems["Used"] = False
                        tweetsItems['LocalImgs'] = []
                        tweetsItems["Content"] = json.dumps(mblog,
                                                            ensure_ascii=False)
                        tweetsItems["PubTime"] = mblog["created_at"]
                        tweetsItems["Like"] = mblog["attitudes_count"]
                        tweetsItems["Comment"] = mblog["comments_count"]
                        tweetsItems["Transfer"] = mblog["reposts_count"]
                        tweetsItems["TweetsText"] = mblog["text"]
                        pics = mblog.get('pics', '')
                        if pics:
                            img_urls = []
                            small_img_urls = []
                            # print(mblog["pics"])
                            for pic in pics:
                                url = pic["large"]['url']
                                surl = pic['url']
                                # print(url)
                                img_urls.append(url)
                                small_img_urls.append(surl)
                            tweetsItems["Imgs"] = img_urls
                            tweetsItems['SmallImgs'] = small_img_urls
                        else:
                            tweetsItems["Imgs"] = []
                            tweetsItems['SmallImgs'] = []
                        yield tweetsItems
                print("###########################")
                print("Tweetspage: " + page)
                print("###########################")
                if int(page) >= Tweets_Num:
                    print("###########################")
                    print("Fetch Tweets Finish")
                    print("###########################")
                    return
                # url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value={value}&containerid={ori_id}&page=%s" % (
                # ID, containerid, page)
                ori_url = 'https://m.weibo.cn/api/container/getIndex?containerid={ori_id}_-_WEIBO_SECOND_PROFILE_WEIBO_ORI' \
                          '&type=uid&page_type=03&value={value}&page={page}'.format(
                              ori_id=ori_ID, value=response.meta['ID'], page=page)
                yield Request(url=ori_url,
                              meta={"ID": ID},
                              callback=self.parseTweets,
                              dont_filter=True)
            else:
                return
        else:
            print("###########################")
            print("Fetch Tweets Finish")
            print("###########################")
            return