Code example #1
 def parse_weibo(self, response):
     """解析网页中的微博信息"""
     keyword = response.meta.get('keyword')
     for sel in response.xpath("//div[@class='card-wrap']"):
         info = sel.xpath(
             "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
         )
         if info:
             weibo = WeiboItem()
             weibo['id'] = sel.xpath('@mid').extract_first()
             weibo['bid'] = sel.xpath(
                 '(.//p[@class="from"])[last()]/a[1]/@href').extract_first(
                 ).split('/')[-1].split('?')[0]
             weibo['user_id'] = info[0].xpath(
                 'div[2]/a/@href').extract_first().split('?')[0].split(
                     '/')[-1]
             weibo['screen_name'] = info[0].xpath(
                 'div[2]/a/@nick-name').extract_first()
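             # Text node of this weibo and, if present, the quoted (retweeted) card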
             txt_sel = sel.xpath('.//p[@class="txt"]')[0]
             retweet_sel = sel.xpath('.//div[@class="card-comment"]')
             retweet_txt_sel = ''
             if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'):
                 retweet_txt_sel = retweet_sel[0].xpath(
                     './/p[@class="txt"]')[0]
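             # "feed_list_content_full" nodes carry the expanded text of long weibos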
             content_full = sel.xpath(
                 './/p[@node-type="feed_list_content_full"]')
             is_long_weibo = False
             is_long_retweet = False
             if content_full:
                 if not retweet_sel:
                     txt_sel = content_full[0]
                     is_long_weibo = True
                 elif len(content_full) == 2:
                     txt_sel = content_full[0]
                     retweet_txt_sel = content_full[1]
                     is_long_weibo = True
                     is_long_retweet = True
                 elif retweet_sel[0].xpath(
                         './/p[@node-type="feed_list_content_full"]'):
                     retweet_txt_sel = retweet_sel[0].xpath(
                         './/p[@node-type="feed_list_content_full"]')[0]
                     is_long_retweet = True
                 else:
                     txt_sel = content_full[0]
                     is_long_weibo = True
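             # Plain text, stripping zero-width and private-use characters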
             weibo['text'] = txt_sel.xpath(
                 'string(.)').extract_first().replace('\u200b', '').replace(
                     '\ue627', '')
             weibo['article_url'] = self.get_article_url(txt_sel)
             weibo['location'] = self.get_location(txt_sel)
             if weibo['location']:
                 weibo['text'] = weibo['text'].replace(
                     '2' + weibo['location'], '')
             weibo['text'] = weibo['text'][2:].replace(' ', '')
             if is_long_weibo:
                 weibo['text'] = weibo['text'][:-6]
             weibo['at_users'] = self.get_at_users(txt_sel)
             weibo['topics'] = self.get_topics(txt_sel)
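             # Repost/comment/like counts; non-numeric button labels fall back to '0'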
             reposts_count = sel.xpath(
                 './/a[@action-type="feed_list_forward"]/text()'
             ).extract_first()
             try:
                 reposts_count = re.findall(r'\d+.*', reposts_count)
             except TypeError:
                 print('Cookie is invalid or expired; please follow '
                       'https://github.com/dataabc/weibo-search#如何获取cookie'
                       ' to get a new cookie')
                 raise CloseSpider()
             weibo['reposts_count'] = reposts_count[
                 0] if reposts_count else '0'
             comments_count = sel.xpath(
                 './/a[@action-type="feed_list_comment"]/text()'
             ).extract_first()
             comments_count = re.findall(r'\d+.*', comments_count)
             weibo['comments_count'] = comments_count[
                 0] if comments_count else '0'
             attitudes_count = sel.xpath(
                 '(.//a[@action-type="feed_list_like"])[last()]/em/text()'
             ).extract_first()
             weibo['attitudes_count'] = (attitudes_count
                                         if attitudes_count else '0')
             created_at = sel.xpath(
                 '(.//p[@class="from"])[last()]/a[1]/text()').extract_first(
                 ).replace(' ', '').replace('\n', '').split('前')[0]
             weibo['created_at'] = util.standardize_date(created_at)
             source = sel.xpath('(.//p[@class="from"])[last()]/a[2]/text()'
                                ).extract_first()
             weibo['source'] = source if source else ''
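             # Image URLs: rewrite thumbnail paths to the /large/ versions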
             pics = ''
             is_exist_pic = sel.xpath(
                 './/div[@class="media media-piclist"]')
             if is_exist_pic:
                 pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
                 pics = [pic[2:] for pic in pics]
                 pics = [
                     re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
                 ]
                 pics = ['http://' + pic for pic in pics]
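             # The video URL is URL-encoded inside the thumbnail's action-data attribute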
             video_url = ''
             is_exist_video = sel.xpath(
                 './/div[@class="thumbnail"]/a/@action-data')
             if is_exist_video:
                 video_url = is_exist_video.extract_first()
                 video_url = unquote(
                     str(video_url)).split('video_src=//')[-1]
                 video_url = 'http://' + video_url
             if not retweet_sel:
                 weibo['pics'] = pics
                 weibo['video_url'] = video_url
             else:
                 weibo['pics'] = ''
                 weibo['video_url'] = ''
             weibo['retweet_id'] = ''
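             # If the card quotes another weibo, build a second WeiboItem for it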
             if retweet_sel and retweet_sel[0].xpath(
                     './/div[@node-type="feed_list_forwardContent"]/a[1]'):
                 retweet = WeiboItem()
                 retweet['id'] = retweet_sel[0].xpath(
                     './/a[@action-type="feed_list_like"]/@action-data'
                 ).extract_first()[4:]
                 retweet['bid'] = retweet_sel[0].xpath(
                     './/p[@class="from"]/a/@href').extract_first().split(
                         '/')[-1].split('?')[0]
                 info = retweet_sel[0].xpath(
                     './/div[@node-type="feed_list_forwardContent"]/a[1]'
                 )[0]
                 retweet['user_id'] = info.xpath(
                     '@href').extract_first().split('/')[-1]
                 retweet['screen_name'] = info.xpath(
                     '@nick-name').extract_first()
                 retweet['text'] = retweet_txt_sel.xpath(
                     'string(.)').extract_first().replace('\u200b',
                                                          '').replace(
                                                              '\ue627', '')
                 retweet['article_url'] = self.get_article_url(
                     retweet_txt_sel)
                 retweet['location'] = self.get_location(retweet_txt_sel)
                 if retweet['location']:
                     retweet['text'] = retweet['text'].replace(
                         '2' + retweet['location'], '')
                 retweet['text'] = retweet['text'][2:].replace(' ', '')
                 if is_long_retweet:
                     retweet['text'] = retweet['text'][:-6]
                 retweet['at_users'] = self.get_at_users(retweet_txt_sel)
                 retweet['topics'] = self.get_topics(retweet_txt_sel)
                 reposts_count = retweet_sel[0].xpath(
                     './/ul[@class="act s-fr"]/li/a[1]/text()'
                 ).extract_first()
                 reposts_count = re.findall(r'\d+.*', reposts_count)
                 retweet['reposts_count'] = reposts_count[
                     0] if reposts_count else '0'
                 comments_count = retweet_sel[0].xpath(
                     './/ul[@class="act s-fr"]/li[2]/a[1]/text()'
                 ).extract_first()
                 comments_count = re.findall(r'\d+.*', comments_count)
                 retweet['comments_count'] = comments_count[
                     0] if comments_count else '0'
                 attitudes_count = retweet_sel[0].xpath(
                     './/a[@action-type="feed_list_like"]/em/text()'
                 ).extract_first()
                 retweet['attitudes_count'] = (attitudes_count
                                               if attitudes_count else '0')
                 created_at = retweet_sel[0].xpath(
                     './/p[@class="from"]/a[1]/text()').extract_first(
                     ).replace(' ', '').replace('\n', '').split('前')[0]
                 retweet['created_at'] = util.standardize_date(created_at)
                 source = retweet_sel[0].xpath(
                     './/p[@class="from"]/a[2]/text()').extract_first()
                 retweet['source'] = source if source else ''
                 retweet['pics'] = pics
                 retweet['video_url'] = video_url
                 retweet['retweet_id'] = ''
                 yield {'weibo': retweet, 'keyword': keyword}
                 weibo['retweet_id'] = retweet['id']
             print(weibo)
             yield {'weibo': weibo, 'keyword': keyword}
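
This method relies on module-level imports and project pieces that the excerpt does not show: re, urllib.parse.unquote, scrapy.exceptions.CloseSpider, the project's WeiboItem, the util helpers (e.g. standardize_date), and sibling spider methods such as get_article_url and get_topics. A minimal sketch of the imports and of an item definition inferred from the fields assigned above (not copied from the weibo-search source) might look like this:

import re
from urllib.parse import unquote

import scrapy
from scrapy.exceptions import CloseSpider

# Inferred item definition: one scrapy.Field per key the snippet assigns.
class WeiboItem(scrapy.Item):
    id = scrapy.Field()
    bid = scrapy.Field()
    user_id = scrapy.Field()
    screen_name = scrapy.Field()
    text = scrapy.Field()
    article_url = scrapy.Field()
    location = scrapy.Field()
    at_users = scrapy.Field()
    topics = scrapy.Field()
    reposts_count = scrapy.Field()
    comments_count = scrapy.Field()
    attitudes_count = scrapy.Field()
    created_at = scrapy.Field()
    source = scrapy.Field()
    pics = scrapy.Field()
    video_url = scrapy.Field()
    retweet_id = scrapy.Field()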
Code example #2
File: search.py    Project: barnett2010/weibo-search
    def parse_weibo(self, response):
        """解析网页中的微博信息"""
        for sel in response.xpath("//div[@class='card-wrap']"):
            info = sel.xpath(
                "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
            )
            if info:
                weibo = WeiboItem()
                weibo['id'] = sel.xpath('@mid').extract_first()

                weibo['user_id'] = info[0].xpath(
                    'div[2]/a/@href').extract_first().split('?')[0].split(
                        '/')[-1]
                weibo['nick_name'] = info[0].xpath(
                    'div[2]/a/@nick-name').extract_first()
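                # Plain text with zero-width and private-use characters removed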
                weibo['txt'] = sel.xpath('.//p[@class="txt"]')[0].xpath(
                    'string(.)').extract_first().replace('\u200b', '').replace(
                        '\ue627', '')
                reposts_count = sel.xpath(
                    './/a[@action-type="feed_list_forward"]/text()'
                ).extract_first()
                reposts_count = re.findall(r'\d+.*', reposts_count)
                weibo['reposts_count'] = reposts_count[
                    0] if reposts_count else '0'
                comments_count = sel.xpath(
                    './/a[@action-type="feed_list_comment"]/text()'
                ).extract_first()
                comments_count = re.findall(r'\d+.*', comments_count)
                weibo['comments_count'] = comments_count[
                    0] if comments_count else '0'
                attitudes_count = sel.xpath(
                    '(.//a[@action-type="feed_list_like"])[last()]/em/text()'
                ).extract_first()
                weibo['attitudes_count'] = (attitudes_count
                                            if attitudes_count else '0')
                created_at = sel.xpath(
                    '(.//p[@class="from"])[last()]/a[1]/text()').extract_first(
                    ).replace(' ', '').replace('\n', '').split('前')[0]
                weibo['created_at'] = util.standardize_date(created_at)
                source = sel.xpath('(.//p[@class="from"])[last()]/a[2]/text()'
                                   ).extract_first()
                weibo['source'] = source if source else ''

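                # Parse the quoted (retweeted) weibo, if present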
                retweet_sel = sel.xpath('.//div[@class="card-comment"]')
                if retweet_sel:
                    retweet = WeiboItem()
                    retweet_id = retweet_sel[0].xpath(
                        './/a[@action-type="feed_list_like"]/@action-data'
                    ).extract_first()[4:]
                    retweet['id'] = retweet_id
                    info = retweet_sel[0].xpath(
                        './/div[@node-type="feed_list_forwardContent"]/a[1]'
                    )[0]
                    retweet['user_id'] = info.xpath(
                        '@href').extract_first().split('/')[-1]
                    retweet['nick_name'] = info.xpath(
                        '@nick-name').extract_first()
                    retweet['txt'] = retweet_sel[0].xpath(
                        './/p[@class="txt"]')[0].xpath(
                            'string(.)').extract_first().replace(
                                '\u200b', '').replace('\ue627', '')
                    reposts_count = retweet_sel[0].xpath(
                        './/ul[@class="act s-fr"]/li/a[1]/text()'
                    ).extract_first()
                    reposts_count = re.findall(r'\d+.*', reposts_count)
                    retweet['reposts_count'] = reposts_count[
                        0] if reposts_count else '0'
                    comments_count = retweet_sel[0].xpath(
                        './/ul[@class="act s-fr"]/li[2]/a[1]/text()'
                    ).extract_first()
                    comments_count = re.findall(r'\d+.*', comments_count)
                    retweet['comments_count'] = comments_count[
                        0] if comments_count else '0'
                    attitudes_count = retweet_sel[0].xpath(
                        './/a[@action-type="feed_list_like"]/em/text()'
                    ).extract_first()
                    retweet['attitudes_count'] = (attitudes_count
                                                  if attitudes_count else '0')
                    created_at = retweet_sel[0].xpath(
                        './/p[@class="from"]/a[1]/text()').extract_first(
                        ).replace(' ', '').replace('\n', '').split('前')[0]
                    retweet['created_at'] = util.standardize_date(created_at)
                    source = retweet_sel[0].xpath(
                        './/p[@class="from"]/a[2]/text()').extract_first()
                    retweet['source'] = source if source else ''
                    yield retweet
                    weibo['retweet_id'] = retweet_id
                print(weibo)
                yield weibo
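
Example #2 is an older variant of the same method: it uses the field names nick_name and txt and yields the WeiboItem directly, whereas example #1 wraps each item in a dict together with the search keyword. A hypothetical pipeline sketch (not the project's actual pipeline) that accepts either form could look like this:

class WeiboStoragePipeline:
    """Hypothetical pipeline: accepts both the bare WeiboItem yielded by
    example #2 and the {'weibo': item, 'keyword': keyword} dict yielded by
    example #1."""

    def process_item(self, item, spider):
        if isinstance(item, dict) and 'weibo' in item:
            weibo, keyword = item['weibo'], item['keyword']
        else:
            weibo, keyword = item, getattr(spider, 'keyword', '')
        # Replace this with real storage (CSV, database, ...).
        spider.logger.info('got weibo %s (keyword=%r)', weibo.get('id'), keyword)
        return item

Such a pipeline would be enabled through the ITEM_PIPELINES setting in the project's settings.py.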