Python TweetsItem Examples

Programming Language: Python

Namespace/Package Name: sina.items

Class/Type: TweetsItem

Examples at hotexamples.com: 14

Python TweetsItem - 14 examples found. These are the top rated real world Python examples of sina.items.TweetsItem extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

TweetsItem(14)

Frequently Used Methods

TweetsItem (14)

Example #1

Show file

File: single_weibo.py Project: PFISH1998/WeiboSpider

    def parse_tweet(self, response):
        page_url = response.url
        tweet_item = TweetsItem()
        tree_node = etree.HTML(response.body)
        tweet_content_node = tree_node.xpath('.//span[@class="ctt"]')[0]
        all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
        tweet_item['content'] = all_content
        tweet_item['crawl_time'] = int(time.time())

        user_tweet_id = re.search(r'https://weibo.cn/(\d+)/(.*)', page_url)
        tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
            user_tweet_id.group(1), user_tweet_id.group(2))
        tweet_item['user_id'] = user_tweet_id.group(1)
        tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                           user_tweet_id.group(1))
        create_time_info = tree_node.xpath(
            './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
        tweet_item['created_at'] = time_fix(
            create_time_info.split('来自')[0].strip())
        like_num = tree_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
        tweet_item['like_num'] = int(re.search('\d+', like_num).group())
        repost_num = tree_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
        tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())
        comment_num = tree_node.xpath(
            './/span[@class="pms" and contains(text(),"评论[")]/text()')[0]
        tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())
        yield tweet_item
        comment_url = page_url + '?page=1'
        yield Request(url=comment_url,
                      callback=self.parse_comment,
                      meta={'weibo_url': page_url})

Example #2

Show file

File: weibo_spider.py Project: koumingyang/SinaScrapy

    def parse_tweets(self, response):
        """ 抓取微博数据 """
        selector = Selector(response)
        ID = re.findall('(\d+)/profile', response.url)[0]
        divs = selector.xpath('body/div[@class="c" and @id]')
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                id = div.xpath('@id').extract_first()  # 微博ID

                real_id = id.split('_')[1]

                content = div.xpath(
                    'div/span[@class="ctt"]//text()').extract()  # 微博内容
                comment = re.findall('评论\[(\d+)\]', div.extract())  # 评论数
                tweetsItems["Comment"] = 0
                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                if content:
                    tweetsItems["Content"] = " ".join(content).strip(
                        '[位置]')  # 去掉最后的"[位置]"
                if comment:
                    tweetsItems["Comment"] = int(comment[0])
                yield tweetsItems

                #enter a tweet
                #https://weibo.cn/comment/ + real_id
                if int(comment[0]) > 0:
                    yield Request(url="https://weibo.cn/comment/%s" % real_id,
                                  callback=self.parse_comment)

            except Exception as e:
                self.logger.info(e)
                pass

Example #3

Show file

    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # 如果是第1页，一次性获取后面的所有页
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
                create_time_info = tweet_node.xpath('.//span[@class="ct" and contains(text(),"来自")]/text()')[0]
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())

                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
                tweet_item['like_num'] = int(re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
                tweet_item['repost_num'] = int(re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[0]
                tweet_item['comment_num'] = int(re.search('\d+', comment_num).group())

                tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]

                # 检测由没有阅读全文:
                all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
                    tweet_item['content'] = all_content
                    yield tweet_item

                # 抓取该微博的评论信息
                comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment, meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)

Example #4

Show file

    def parse_tweets(self, response):
        """ 抓取微博数据 """
        selector = Selector(response)
        ID = re.findall('(\d+)/profile', response.url)[0]
        divs = selector.xpath('body/div[@class="c" and @id]')
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                id = div.xpath('@id').extract_first()  # 微博ID
                content = div.xpath(
                    'div/span[@class="ctt"]//text()').extract()  # 微博内容
                cooridinates = div.xpath('div/a/@href').extract()  # 定位坐标
                like = re.findall('赞\[(\d+)\]', div.extract())  # 点赞数
                transfer = re.findall('转发\[(\d+)\]', div.extract())  # 转载数
                comment = re.findall('评论\[(\d+)\]', div.extract())  # 评论数
                others = div.xpath('div/span[@class="ct"]/text()').extract(
                )  # 求时间和使用工具（手机或平台）

                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                if content:
                    tweetsItems["Content"] = " ".join(content).strip(
                        '[位置]')  # 去掉最后的"[位置]"
                if cooridinates:
                    cooridinates = re.findall('center=([\d.,]+)',
                                              cooridinates[0])
                    if cooridinates:
                        tweetsItems["Co_oridinates"] = cooridinates[0]
                if like:
                    tweetsItems["Like"] = int(like[0])
                if transfer:
                    tweetsItems["Transfer"] = int(transfer[0])
                if comment:
                    tweetsItems["Comment"] = int(comment[0])
                if others:
                    others = others[0].split('来自')
                    tweetsItems["PubTime"] = others[0].replace(u"\xa0", "")
                    if len(others) == 2:
                        tweetsItems["Tools"] = others[1].replace(u"\xa0", "")
                yield tweetsItems
            except Exception as e:
                self.logger.info(e)
                pass

        url_next = selector.xpath(
            'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
        ).extract()
        if url_next:
            yield Request(url=self.host + url_next[0],
                          callback=self.parse_tweets,
                          dont_filter=True)

Example #5

Show file

File: sinaSpider.py Project: ningg/sinaSpider-1

    def parseTweets(self, response):
        if len(response.body) > 50:
            print "###########################"
            print "Fetch Tweets Success"
            print "###########################"

            tweets = json.loads(response.body)
            ID = response.meta["ID"]
            page = ''
            containerid = ''
            if tweets.get("cards", ""):
                cards = tweets["cards"]
                if tweets["cardlistInfo"].get("page", ""):
                    page = tweets["cardlistInfo"]["page"]
                    page = str(page)
                else:
                    return
                if tweets["cardlistInfo"].get("containerid", ""):
                    containerid = tweets["cardlistInfo"]["containerid"]
                for card in cards:
                    mblog = card.get('mblog', '')
                    if mblog:
                        tweetsItems = TweetsItem()
                        tweetsItems["_id"] = card["itemid"]
                        tweetsItems["ID"] = ID
                        tweetsItems["Content"] = json.dumps(mblog)
                        tweetsItems["PubTime"] = mblog["created_at"]
                        tweetsItems["Like"] = mblog["attitudes_count"]
                        tweetsItems["Comment"] = mblog["comments_count"]
                        tweetsItems["Transfer"] = mblog["reposts_count"]
                    yield tweetsItems
                print "###########################"
                print "Tweetspage: " + page
                print "###########################"
                url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s&page=%s" % (
                    ID, containerid, page)
                yield Request(url=url_tweets,
                              meta={"ID": ID},
                              callback=self.parseTweets,
                              dont_filter=True)
            else:
                return
        else:
            print "###########################"
            print "Fetch Tweets Finish"
            print "###########################"
            return

Example #6

Show file

    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # 如果是第1页，一次性获取后面的所有页
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                # 检测由没有阅读全文:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # 抓取该微博的评论信息
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)

Example #7

Show file

    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # if page 1, get all page number
            self.current_page = 1
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                self.all_page_num = all_page
        print("[INFO] Crawling Tweets Page: " + str(self.current_page))
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time_utc'] = dt.utcnow(
                )  # insert datetime timestamp utc
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                # if tweet_item['user_id']:
                #     print("[DEBUG] user_id:" + str(tweet_item['user_id']))
                # else:
                #     print("[DEBUG] user_id ERROR")

                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = ""

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())
                #print("[DEBUG] like_num:" + str(tweet_item['like_num']))
                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                #print("[DEBUG] repost_num:" + str(tweet_item['repost_num']))
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                #print("[DEBUG] comment_num:" + str(tweet_item['comment_num']))
                # Add to grab all images 1) test if multi images link exists 2) if not use the
                multi_img_link = tweet_node.xpath(
                    './/a[contains(text(),"组图")]/@href')
                if multi_img_link:
                    #print("[DEBUG] multi_img_link:" + multi_img_link[-1])
                    tweet_item['multi_imgs'] = True
                    yield Request(url=multi_img_link[-1],
                                  callback=self.parse_multi_images,
                                  meta={'_id': tweet_item['_id']},
                                  priority=1)
                else:
                    tweet_item['multi_imgs'] = False

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]
                else:
                    tweet_item['image_url'] = "NA"

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]
                else:
                    tweet_item['video_url'] = "NA"

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                else:
                    tweet_item['location_map_info'] = "NA"

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['retweet'] = True
                    tweet_item['origin_weibo'] = repost_node[0]
                    # crawl original weibo
                    # origin_weibo_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                    # yield Request(url=repost_url, callback=self.parse_repost, meta={'weibo_url': tweet_item['weibo_url']},priority = 2)

                else:
                    tweet_item['retweet'] = False
                    tweet_item['origin_weibo'] = "NA"
                # 检测由没有阅读全文:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # 抓取该微博的评论信息
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

                # Crawl tweet repost
                repost_url = self.base_url + '/repost/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=repost_url,
                              callback=self.parse_repost,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

            except Exception as e:
                self.logger.error(e)

        #  keep looping until hit page with time range limit
        self.current_page = self.current_page + 1
        if self.time_stop_flag == 0 and self.current_page < (
                self.all_page_num + 1) and self.current_page >= 2:
            next_page = self.current_page
            current_page_str = "page=" + str(next_page - 1)
            page_url = response.url.replace(current_page_str,
                                            'page={}'.format(next_page))
            yield Request(page_url,
                          self.parse_tweet,
                          dont_filter=True,
                          meta=response.meta,
                          priority=1)

Example #8

Show file

File: weibo_spider.py Project: ZoomCypher/weibo

    def parse_tweets(self, response):
        """
        functions:
           1. catch each tweet
           2. request next page if existed
        """
        ID = re.findall('(\d+)/profile', response.url)[0]
        divs = response.xpath('body/div[@class="c" and @id]')
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                # _id and ID
                id = div.xpath('@id').extract_first()
                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                # content
                if div.xpath('div/span[@class="ctt"]//text()').extract():
                    content = div.xpath(
                        'div/span[@class="ctt"]//text()').extract()
                    content = " ".join(content).strip('[位置]').strip()
                    # parse content
                    tweetsItems["Content"] = content.replace(
                        u"\u200b", "").replace(u"\xa0 全文", "")
                # coordinates
                if div.xpath('div/a/@href').extract():
                    cooridinates = div.xpath('div/a/@href').extract()
                    cooridinates = re.findall('center=([\d.,]+)',
                                              cooridinates[0])
                    if cooridinates:
                        tweetsItems["Co_oridinates"] = cooridinates[0]
                # like
                if re.findall('赞\[(\d+)\]', div.extract()):
                    like = re.findall('赞\[(\d+)\]', div.extract())
                    tweetsItems["Like"] = int(like[0])
                # transfer
                if re.findall('转发\[(\d+)\]', div.extract()):
                    transfer = re.findall('转发\[(\d+)\]', div.extract())
                    tweetsItems["Transfer"] = int(transfer[0])
                # comment:
                if re.findall('评论\[(\d+)\]', div.extract()):
                    comment = re.findall('评论\[(\d+)\]', div.extract())
                    tweetsItems["Comment"] = int(comment[0])
                # date and equipments/platform
                if div.xpath('div/span[@class="ct"]/text()').extract():
                    others = div.xpath(
                        'div/span[@class="ct"]/text()').extract()
                    others = others[0].split('来自')
                    tweetsItems["PubTime"] = others[0].replace(u"\xa0", "")
                    if len(others) == 2:
                        tweetsItems["Tools"] = others[1].replace(u"\xa0", "")

                print(tweetsItems)
                yield tweetsItems
            except Exception as e:
                self.logger.info(e)
                pass
        # request next page
        next_url = "https://weibo.cn" + response.xpath(
            "//div[@class='pa']/form/div/a[1]/@href").extract()
        if next_url:
            yield Request(url=next_url[0],
                          callback=self.parse_tweets,
                          dont_filter=True)

Example #9

Show file

File: weibo_spider.py Project: MarvelousDick/WeiboSpiderSimple

    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # 如果是第1页，一次性获取后面的所有页
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        # 总爬虫数加一
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                self.total_scrap_num += 1
                tweet_item['dataset_id'] = self.dataset_id
                tweet_item['blogger_id'] = self.blogger_id
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                # 设置爬虫终点,最多爬几天前的微博, 最多爬多少条
                time_now = datetime.datetime.now()
                created_time = datetime.datetime.strptime(
                    tweet_item['created_at'], "%Y-%m-%d %H:%M")
                if (
                        time_now - created_time
                ).days > MAX_INTERVAL & self.total_scrap_num > MAX_SCRAP_NUM:
                    # TODO 删除这条Twitter记录
                    mongodb_operation.delete_twitter_rec(
                        weibo_url=tweet_item['weibo_url'],
                        dataset_id=self.dataset_id)
                    return

                # TODO 假如微博已经存在, 则删除过去微博记录以及评论记录
                mongodb_operation.delete_previous_twitter_rec(
                    weibo_url=tweet_item['weibo_url'],
                    current_dataset_id=self.dataset_id)
                mongodb_operation.delete_previous_comment_under_twitter(
                    weibo_url=tweet_item['weibo_url'],
                    current_dataset_id=self.dataset_id)

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())

                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # 检测由没有阅读全文:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[0:]
                    yield tweet_item

                # 抓取该微博的评论信息
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)

Example #10

Show file

    def parse(self, response):
        """
        解析搜索页面
        :param response:
        :return:
        """
        if response.url.endswith('page=1'):
            # 如果是第1页，一次性获取后面的所有页
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # TODO 这里加上获取提问者的user_id的解析
                # 换一下顺序，放到回答者的request后面，不行，这里的asker_name_url并不包含info
                # 通过xpath结合正则表达式提取提问者的user_id，或者直接就是他的个人页面,但是这里获得的是以昵称为url的，跳转之后返回的就是id了
                asker_name_urltxt = tweet_node.xpath(
                    './/a[contains(text(),"@")]/text()')[0]
                asker_name_url = self.base_url + tweet_node.xpath(
                    './/a[contains(text(),"@")]/@href')[0]
                # print('提问者的url', asker_name_url)
                tweet_item['asker_name'] = asker_name_urltxt.split('@')[-1]
                # asker_name_url = self.base_url + asker_name_url
                # print('提问者的昵称', tweet_item['asker_name'])
                response_nickname = requests.get(asker_name_url)
                response_url = response_nickname.url
                if 'weibo.cn/u/' in response_url:
                    nickname_id = response_url.split('weibo.cn/u/')[-1]
                else:
                    nickname_id = response_url.split('uid=')[-1]
                # TODO 这里yield一个提问者的request
                # https://blog.csdn.net/rgc_520_zyl/article/details/78946974
                # header = {,'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
                # asker 表的 _id 应该是tweet _id
                yield Request(
                    url="https://weibo.cn/{}/info".format(nickname_id),
                    callback=self.parse_information,
                    priority=3,
                    meta={'asker_from': tweet_item['weibo_url']})

                # 检测由没有阅读全文:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[1:]
                    yield tweet_item

                # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                #               callback=self.parse_information, priority=2)
                yield Request(url="https://weibo.cn/{}/info".format(
                    tweet_item['user_id']),
                              callback=self.parse_information,
                              priority=2)

                # TODO 检测有无评论，如果有yield一个parse_comment
                if tweet_item['comment_num'] > 0:
                    # 抓取该微博的评论信息
                    comment_url = self.base_url + '/comment/' + tweet_item[
                        'weibo_url'].split('/')[-1] + '?page=1'
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']},
                                  priority=5)

            except Exception as e:
                self.logger.error(e)

Example #11

Show file

    def parse1(self, response):
        '''抓取个人信息2'''
        #因为有的字段不存在，而mysql需要提取字段，为保存不出错，先统一默认为空，monogodb不存在这个问题，是因为mongodb直接将dict插入数据库，不需要对每个字段赋值
        informationItems = InformationItem()
        informationItems['NickName'] = ''
        informationItems['Gender'] = ''
        informationItems['City'] = ''
        informationItems['URL'] = ''
        informationItems['Num_Fans'] = ''
        informationItems['Num_Follows'] = ''
        informationItems['Num_Tweets'] = ''
        informationItems['Province'] = ''
        informationItems['Signature'] = ''
        #		informationItems = response.meta["item"]
        selector = Selector(response)
        ID = re.findall('weibo\.cn/(\d+)', response.url)[0]
        text1 = ";".join(
            selector.xpath('body/div[@class="c"]/text()').extract())
        print('text1的数据是：')
        print(text1)
        nickname = re.findall('昵称[:|：](.*?);', text1)  # 昵称
        gender = re.findall('性别[:|：](.*?);', text1)  # 性别
        place = re.findall('地区[:|：](.*?);', text1)  # 地区（包括省份和城市）
        signature = re.findall('简介[:|：](.*?);', text1)  # 个性签名
        birthday = re.findall('生日[:|：](.*?);', text1)  # 生日
        sexorientation = re.findall('性取向[:|：](.*?);', text1)  # 性取向
        marriage = re.findall('感情状况[:|：](.*?);', text1)  # 婚姻状况
        url = re.findall('互联网[:|：](.*?);', text1)  # 首页链接
        print('nieckname和gender的数据是：')
        print(nickname)
        print(gender)
        if nickname:
            informationItems["NickName"] = nickname[0]
        if gender:
            informationItems['Gender'] = gender[0]
        if place:
            place = place[0].split(' ')
            informationItems['Province'] = place[0]
            if len(place) > 1:
                informationItems['City'] = place[1]
        if signature:
            informationItems['Signature'] = signature[0]
        if birthday:
            try:
                birthday = datatime.datetime.strptime(birthday[0], "%Y-%m-%d")
                informationItems["Birthday"] = birthday - datetime.timedelta(
                    hours=8)
            except Exception:
                pass

        if sexorientation:
            if sexorientation[0] == gender[0]:
                informationItems["Sex_Orientation"] = "gay"
            else:
                informationItems["Sex_Orientation"] = "Heterosexual"
        if marriage:
            informationItems['Marriage'] = marriage[0]

        if url:
            informationItems["URL"] = url[0]

        urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID
        r = requests.get(urlothers, cookies=response.request.cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = ';'.join(selector.xpath('//div[@class="tip2"]/a/text()'))
            print('texts的数据是')
            print(texts)
            if texts:
                num_tweets = re.findall('微博\[(\d+)\]', texts)  #微博数
                num_follows = re.findall('关注\[(\d+)\]', texts)  #关注数
                num_fans = re.findall('粉丝\[(\d+)\]', texts)  #粉丝数
                if num_tweets:
                    informationItems['Num_Tweets'] = int(num_tweets[0])
                if num_follows:
                    informationItems['Num_Follows'] = int(num_follows[0])
                if num_fans:
                    informationItems['Num_Fans'] = int(num_fans[0])
        print('informationItems的数据是：')
        print(informationItems)
        yield informationItems

        contents = []
        tweets = TweetsItem()
        tweets['_id'] = ID
        tweets['Content'] = contents

        yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID,
                      meta={
                          'item': tweets,
                          'contents': contents
                      },
                      callback=self.parse_tweets)

Example #12

Show file

    def parse(self, response):
        if response.url.endswith('page=1'):
            # 如果是第1页，一次性获取后面的所有页
            # />&nbsp:html中的空格占位符
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        # 选取所有的div元素+属性class=c+拥有id属性
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                # todo 转发和评论的url
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                tweet_cmt_url = tweet_node.xpath(
                    './/a[contains(text(),"评论[")]/@href')[0]
                # 发送微博的用户id
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                # _id作为微博的id
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                # 去掉时间后面的部分 比如来自新浪微博/来自iphone
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search('\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search('\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search('\d+', comment_num).group())
                tweet_content_node = tweet_node.xpath(
                    './/span[@class="ctt"]')[0]

                # 检测有没有阅读全文:
                all_content_link = tweet_content_node.xpath(
                    './/a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    all_content = tweet_content_node.xpath(
                        'string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[1:]
                    yield tweet_item
                    # todo

                # 爬取评论用户信息和评论内容
                if tweet_item['comment_num'] > 0:
                    yield Request(url=tweet_cmt_url,
                                  callback=self.parse_cmt_info,
                                  meta={'weibo_id': tweet_item['_id']},
                                  priority=3)

                # # 爬取发微博的用户信息
                # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                #               callback=self.parse_information, priority=1)

                # todo 爬去转发用户的信息
                # todo 爬去转发用户的id和评论，赞数，加上微博的id可以匹配

            except Exception as e:
                self.logger.error(e)

Example #13

Show file

    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # 如果是第1页，一次性获取后面的所有页
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                if (all_page > self.MAX_WEIBO_PAGES):
                    all_page = self.MAX_WEIBO_PAGES
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                create_time_info = tweet_node.xpath(
                    './/span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())
                #时间最低日期
                if (tweet_item['created_at'] < self.MIN_WEIBO_DATE):
                    1
                else:
                    like_num = tweet_node.xpath(
                        './/a[contains(text(),"赞[")]/text()')[-1]
                    tweet_item['like_num'] = int(
                        re.search('\d+', like_num).group())

                    repost_num = tweet_node.xpath(
                        './/a[contains(text(),"转发[")]/text()')[-1]
                    tweet_item['repost_num'] = int(
                        re.search('\d+', repost_num).group())

                    comment_num = tweet_node.xpath(
                        './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                    )[-1]
                    tweet_item['comment_num'] = int(
                        re.search('\d+', comment_num).group())

                    # 检测由没有阅读全文:
                    all_content_link = tweet_node.xpath(
                        './/a[text()="全文" and contains(@href,"ckAll=1")]')
                    if all_content_link:
                        all_content_url = self.base_url + all_content_link[
                            0].xpath('./@href')[0]
                        yield Request(all_content_url,
                                      callback=self.parse_all_content,
                                      meta={'item': tweet_item},
                                      priority=1)

                    else:
                        all_content_text = tweet_node.xpath('string(.)')
                        if '转发理由:' in all_content_text:
                            all_content_text = all_content_text.split(
                                '转发理由:')[1]
                        all_content_text = all_content_text.split(
                            '\xa0', maxsplit=1)[0]
                        tweet_item['content'] = all_content_text.strip()
                        try:
                            s = SnowNLP(tweet_item['content'])
                            tweet_item['sentiments'] = str(s.sentiments *
                                                           10)[0:8]
                        except:
                            tweet_item['sentiments'] = '5.0'

                        try:
                            sql = "INSERT INTO `sbhdb`.`weibo_info`( `weibo_url`, `user_id`, `content`, `created_at`, `repost_num`, `comment_num`, `like_num`, `crawl_time`, `sentiments`) VALUES ('%s', '%s', '%s', '%s', %s,%s, %s, %s,%s)" % (
                                tweet_item['weibo_url'], tweet_item['user_id'],
                                tweet_item['content'],
                                tweet_item['created_at'],
                                tweet_item['repost_num'],
                                tweet_item['comment_num'],
                                tweet_item['like_num'],
                                tweet_item['crawl_time'],
                                tweet_item['sentiments'])
                            self.cursor.execute(sql)
                            self.db.commit()
                        except:
                            # 数据有重复
                            continue
                            pass
                    yield tweet_item

                    # 抓取该微博的评论信息
                    comment_url = self.base_url + '/comment/' + tweet_item[
                        'weibo_url'].split('/')[-1] + '?page=1'
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)

Example #14

Show file

File: sinaSpider.py Project: YelZhang/weiboSpider-1

 def parseTweets(self, response):
     if len(response.body) > 50:
         print "###########################"
         print "Fetch Tweets Success"
         print "###########################"
         ori_ID = response.meta['ori_id']
         tweets = json.loads(response.body)
         ID = response.meta["ID"]
         Owner = response.meta["owner"]
         page = ''
         containerid = ''
         if tweets.get("cards", ""):
             cards = tweets["cards"]
             if tweets["cardlistInfo"].get("page", ""):
                 page = tweets["cardlistInfo"]["page"]
                 page = str(page)
             else:
                 return
             # if tweets["cardlistInfo"].get("containerid", ""):
             #     containerid = tweets["cardlistInfo"]["containerid"]
             for card in cards:
                 mblog = card.get('mblog', '')
                 if mblog:
                     tweetsItems = TweetsItem()
                     tweetsItems["_id"] = mblog["id"]
                     tweetsItems["ID"] = ID
                     tweetsItems["Owner"] = Owner
                     tweetsItems["Used"] = False
                     tweetsItems['LocalImgs'] = []
                     tweetsItems["Content"] = json.dumps(mblog).decode(
                         'unicode-escape')
                     tweetsItems["PubTime"] = mblog["created_at"]
                     tweetsItems["Like"] = mblog["attitudes_count"]
                     tweetsItems["Comment"] = mblog["comments_count"]
                     tweetsItems["Transfer"] = mblog["reposts_count"]
                     tweetsItems["TweetsText"] = mblog["text"]
                     pics = mblog.get('pics', '')
                     if pics:
                         img_urls = []
                         small_img_urls = []
                         # print mblog["pics"]
                         for pic in pics:
                             url = pic["large"]['url']
                             surl = pic['url']
                             # print url
                             img_urls.append(url)
                             small_img_urls.append(surl)
                         tweetsItems["Imgs"] = img_urls
                         tweetsItems['SmallImgs'] = small_img_urls
                     else:
                         tweetsItems["Imgs"] = []
                         tweetsItems['SmallImgs'] = []
                 yield tweetsItems
             print "###########################"
             print "Tweetspage: " + page
             print "###########################"
             if page >= Tweets_Num:
                 print "###########################"
                 print "Fetch Tweets Finish"
                 print "###########################"
                 return
             # url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value={value}&containerid={ori_id}&page=%s" % (
             # ID, containerid, page)
             ori_url = 'https://m.weibo.cn/api/container/getIndex?containerid={ori_id}_-_WEIBO_SECOND_PROFILE_WEIBO_ORI' \
                       '&type=uid&page_type=03&value={value}&page={page}'.format(
                 ori_id=ori_ID, value=response.meta['ID'],page=page
             )
             yield Request(url=ori_url,
                           meta={"ID": ID},
                           callback=self.parseTweets,
                           dont_filter=True)
         else:
             return
     else:
         print "###########################"
         print "Fetch Tweets Finish"
         print "###########################"
         return