Example #1
    def parse_water_fall(self, response):
        # prepare data
        data = response.json()
        uid, folder = response.meta['uid'], response.meta['folder']

        # continue to next page
        cursor = data['next_cursor']
        yield scrapy.Request(api.get_water_fall(uid, cursor), callback=self.parse_water_fall, meta=response.meta)

        # yield all videos
        for video in data['list']:
            video, mid = video['page_info'], video['mid']
            video_type = video['object_type']

            match video_type:
                case 'video':
                    urls = [video['media_info'][key] for key in self.video_keys]
                    url = urls[0] if urls else ''
                    self.logger.info(f'{folder} found 1 video (from {response.url})')
                    yield WeiboItem(uuid=mid, filename=f'{folder}/{mid}.mp4', file_urls=[url])

                case 'story':
                    for i, slide in enumerate(video['slide_cover']['slide_videos']):
                        url = slide['url']
                        yield WeiboItem(uuid=f'{mid}_{i}', filename=f'{folder}/{mid}_{i}.mp4', file_urls=[url])

                case _:
                    self.logger.warning('Unknown video type "%s".', video_type)
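
Every snippet on this page assumes a project-specific WeiboItem defined in the spider's items.py, and the field set differs from example to example. A minimal sketch of the item Example #1 appears to expect (field names are taken from the snippet above; the class itself is an assumption):

    import scrapy

    class WeiboItem(scrapy.Item):
        uuid = scrapy.Field()        # deduplication key, e.g. the weibo mid
        filename = scrapy.Field()    # target path such as '<folder>/<mid>.mp4'
        file_urls = scrapy.Field()   # list of urls picked up by Scrapy's FilesPipeline

file_urls matches the field name Scrapy's built-in FilesPipeline looks for, which is presumably why the download url is wrapped in a one-element list.
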
Example #2
 def parse_img(self, response):
     item = WeiboItem()
     item['img_url'] = response.url
     item['img_id'] = response.meta['img_id']
     item['img_path'] = response.meta['dir'] + item['img_id'] + response.url[-4:]
     yield item
 def parse_weibo(self, response):
     print(response.xpath('/html/head/title/text()').extract()[0])
     for p in response.xpath('//div[contains(@id,"M")]'):
         item = WeiboItem()
         item['content'] = p.xpath('./div/span/text()').extract()
         yield item
     yield Request("http://weibo.cn" + response.xpath('//*[@id="pagelist"]/form/div/a/@href').extract()[0], cookies=self.cookies, callback=self.parse_weibo)
Example #4
    def parse_user(self, response):
        user_item = response.meta['key']
        user_item['weibo_count'] = response.xpath(
            '//span[@class="tc"]/text()').re_first(r'\d+')
        user_item['following_count'] = response.xpath(
            '//a[contains(@href, "follow")]/text()').re_first(r'\d+')
        user_item['follower_count'] = response.xpath(
            '//a[contains(@href, "fans")]/text()').re_first(r'\d+')
        weibos = response.xpath('//div[contains(@id, "M")]')
        for weibo in weibos:
            weibo_item = WeiboItem()
            # print('♦️' * 100)
            # print(weibo.xpath('./@id').extract_first())
            # print('♦️' * 100)
            weibo_item['name'] = user_item['name']
            weibo_item['id'] = weibo.xpath('./@id').extract_first()
            weibo_item['text'] = weibo.xpath(
                './/span[@class="ctt"]/text()').extract_first()
            weibo_item['pictures'] = weibo.xpath(
                './/img[@alt="图片"]/@src').extract()
            weibo_item['videos'] = weibo.xpath(
                './/a[contains(text(), "视频")]/@href').extract_first()
            weibo_item['likes_count'] = weibo.xpath(
                './/a[contains(text(), "赞")]/text()').re_first(r'\d+')
            weibo_item['reposts_count'] = weibo.xpath(
                './/a[contains(text(), "转发")]/text()').re_first(r'\d+')
            weibo_item['comments_count'] = weibo.xpath(
                './/a[contains(text(), "评论")]/text()').re_first(r'\d+')
            # print(weibo_item['id'])
            yield weibo_item

        yield user_item
Example #5
    def get_user(self, response):
        if response.status == 200:
            data_script = response.xpath('//script/text()').extract()
            for s in data_script:
                if 'FM.view({"ns":"pl.content.followTab.index"' in s:
                    selector = js2xml_unescape(s)
                    nodes = selector.xpath('//li[@class="follow_item S_line2"]')

                    for n in nodes:
                        content = etree.tostring(n, pretty_print=True, encoding='utf8')
                        s = etree.HTML(unescape(content.decode()))
                        username = s.xpath('//li[@class="follow_item S_line2"]//dt/a/@title')[0]
                        href = s.xpath('//li[@class="follow_item S_line2"]//dt/a/@href')[0]
                        follow_c = s.xpath('//span[1]/em[@class="count"]/a/text()')
                        fans_c = s.xpath('//span[2]/em[@class="count"]/a/text()')
                        article_c = s.xpath('//span[3]/em[@class="count"]/a/text()')

                        if follow_c and fans_c and article_c:
                            # Some public accounts hide follower/following data for privacy and are not collection targets, so exclude them directly.
                            # e.g. 同性恋婚姻合法化: https://weibo.com/p/1008089993f2f8dc0d7a92bc4a28748d6e8fd0/super_index
                            if int(fans_c[0]) < 3000 and int(follow_c[0]) < 1000 and int(article_c[0]) > 1:
                                # fewer than 3000 fans, fewer than 1000 followings, and more than 1 post
                                item = WeiboItem()
                                item["username"] = username
                                item['href'] = urljoin('https://weibo.com', href)
                                item['md5_index'] = get_md5(item['href'])
                                item['done'] = False
                                yield item

                    next_page_url = self._get_next_page(selector)
                    if next_page_url:
                        yield scrapy.Request(url=next_page_url, callback=self.get_user)
 def parse_weibos(self, response):
     result = json.loads(response.text)
     if result.get('ok') and result.get('data').get('cards'):
         weibos = result.get('data').get('cards')
         for weibo in weibos:
             mblog = weibo.get('mblog')
             if mblog:
                 weibo_item = WeiboItem()
                 field_map = {
                     'id': 'id',
                     'attitudes_count': 'attitudes_count',
                     'comments_count': 'comments_count',
                     'created_at': 'created_at',
                     'reposts_count': 'reposts_count',
                     'picture': 'original_pic',
                     'pictures': 'pics',
                     'source': 'source',
                     'text': 'text',
                     'raw_text': 'raw_text',
                     'thumbnail': 'thumbnail_pic'
                 }
                 for field, value in field_map.items():
                     weibo_item[field] = mblog.get(value)
                 weibo_item['user'] = response.meta.get('uid')
                 yield weibo_item
         # next page of weibo posts
         uid = response.meta.get('uid')
         page = response.meta.get('page') + 1
         yield Request(self.weibo_url.format(uid=uid, page=page),
                       callback=self.parse_weibos,
                       meta={
                           'uid': uid,
                           'page': page
                       })
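
parse_weibos pages by re-requesting the same endpoint with page incremented in meta, so it has to be seeded with an initial request that already carries uid and page. A minimal sketch of such a seed, assuming weibo_url is the spider's url template used above; the uid value is a placeholder:

    def start_requests(self):
        uid = '1234567890'  # hypothetical user id
        yield Request(self.weibo_url.format(uid=uid, page=1),
                      callback=self.parse_weibos,
                      meta={'uid': uid, 'page': 1})
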
Example #7
    def parse(self, response):
        # save the result
        item = WeiboItem()
        resp_dict = json.loads(response.text)
        user = resp_dict['data']['user']
        # user id
        item['uid'] = user['id']
        # username
        item['screen_name'] = user['screen_name']
        # profile page url
        item['profile_url'] = user['profile_url']
        # following count
        item['follow_count'] = user['follow_count']
        # follower count
        item['followers_count'] = user['followers_count']
        print(user['screen_name'])
        yield item

        # user id
        uid = furl(response.url).args['uid']
        # switch the containerid to the current user's follower list
        self.attention.args['containerid'] = '231051_-_followers_-_{}'.format(
            uid)
        if uid not in self.data:
            self.data.add(uid)
            yield scrapy.Request(url=self.attention.url,
                                 meta={'uid': uid},
                                 callback=self.parse_follow)
Example #8
 def parse(self, response):
     # print(response.text)
     item = WeiboItem()
     for p in response.xpath("//div[@class='c' and @id]"):
         try:
             text_transfrom = "".join(
                 p.xpath("./div/text()").re(r'[\u4e00-\u9fa5]'))
             text_fanart = "".join(
                 p.xpath("./div/span[@class='ctt']/text()").extract())
             item['text_fanart'] = text_fanart
             item['text_transfrom'] = text_transfrom
             item['like'] = "".join(
                 p.xpath("./div/a").re(r'赞\[[0-9]*?\]')).replace(
                     '赞[', '').replace(']', '')
             item['transmit'] = "".join(
                 p.xpath("./div/a").re(r'转发\[[0-9]*?\]')).replace(
                     '转发[', '').replace(']', '')
             item['comment'] = "".join(
                 p.xpath("./div/a").re(r'评论\[[0-9]*?\]')).replace(
                     '评论[', '').replace(']', '')
             time_from = "".join(
                 p.xpath("./div/span[@class='ct']/text()").extract()).split(
                     "\xa0来自")
             item['time'] = self.clear_date(time_from[0])
             item['_from'] = time_from[1]
             yield item
         except Exception as e:
             print(e)
             continue
 def parse_detail(self, response):
     url = response.url
     item = WeiboItem()
     item['content'] = response.xpath(
         '//div[@class="c" and @id="M_"]//span[@class="ctt"]//text()'
     ).extract()
     yield item
Example #10
 def parse_detail(self, response):
     print('-----parse_detail------')
     id = re.search(r'comment/(.*?)\?', response.url).group(1)
     url = response.url
     content = ''.join(
         response.xpath(
             '//div[@id="M_"]//span[@class="ctt"]//text()').extract())
     print(id, url, content)
     comment_count = response.xpath(
         '//span[@class="pms"]//text()').re_first(r'评论\[(.*?)\]')
     forward_count = response.xpath(
         '//a[contains(., "转发[")]//text()').re_first(r'转发\[(.*?)\]')
     like_count = response.xpath('//a[contains(., "赞[")]//text()').re_first(
         r'赞\[(.*?)\]')
     print('comments: {} reposts: {} likes: {}'.format(
         comment_count, forward_count, like_count))
     posted_at = response.xpath('//div[@id="M_"]//span[@class="ct"]//text()'
                                ).extract_first(default=None)
     user = response.xpath('//div[@id="M_"]/div/a/text()').extract_first(
         default=None)
     print(posted_at, user)
     weibo_item = WeiboItem()
     for field in weibo_item.fields:
         try:
             # pick up the local variable whose name matches the field
             weibo_item[field] = eval(field)
         except NameError:
             self.logger.debug('Field not defined: ' + field)
     yield weibo_item
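
Example #10 fills the item by eval-ing local variables whose names match the item's field names. A less magical sketch of the same idea, assuming the same local names, is to snapshot locals() once and look each field up in it:

    loc = locals()
    for field in weibo_item.fields:
        if field in loc:
            weibo_item[field] = loc[field]
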
Example #11
    def parse_weibo(self, response):

        result = json.loads(response.text)

        if result.get('data') and result.get('data').get('cards'):

            weibo_list = result.get('data').get('cards')
            field_map = {
                'created_at': 'created_at',
                'id': 'id',
                # 'attitudes_count': 'attitudes_count',
            }

            for weibo in weibo_list:
                mblog = weibo.get('mblog')
                if mblog:
                    # create a fresh item for each weibo so every yield carries its own data
                    item = WeiboItem()
                    for k, v in field_map.items():
                        item[k] = mblog[v]
                    yield item
        # next page of weibo posts
        uid = response.meta.get('uid')
        page = response.meta.get('page') + 1
        self.log(uid)
        yield scrapy.Request(url=self.weibo_url.format(uid=uid, page=page),
                             callback=self.parse_weibo,
                             meta={
                                 'page': page,
                                 'uid': uid
                             })
Example #12
    def parse(self, response):
        data = json.loads(response.text)
        if data and 'data' in data:
            pattern = re.compile(
                '<div.*?list_title_b.*?<a href="(.*?)".*?_blank">(.*?)</a>.*?subinfo S_txt2">(.*?)</span></a>.*?'
                +
                'S_txt2">(.*?)</span>.*?praised S_ficon W_f16">ñ</em><em>(.*?)</em>.*?ficon_'
                +
                'repeat S_ficon W_f16">.*?</em><em>(.*?)</em>.*?forward S_ficon W_f16.*?</em><em>'
                + '(.*?)</em>.*?</div>', re.S)
            result = re.findall(pattern, data.get('data'))
            for info in result:
                item = WeiboItem()
                item['content'] = info[1]
                item['author'] = info[2]
                item['publishTime'] = info[3]
                item['repost'] = info[4]
                item['comment'] = info[5]
                item['approve'] = info[6]
                item['address'] = info[0]
                yield item

            if self.offset < 30:
                self.offset += 1
                url = self.base_url.format(self.offset)
                yield scrapy.Request(url, callback=self.parse)
Example #13
 def parse(self, response):
     result = json.loads(response.text)
     if result.get('ok') and result.get('data').get('cards'):
         weibos = result.get('data').get('cards')
         for weibo in weibos:
             myblog = weibo.get('mblog')
             if myblog:
                 item = WeiboItem()
                 field_map = {
                     'id': 'id',
                     'attitudes_count': 'attitudes_count',
                     'comments_count': 'comments_count',
                     'reposts_count': 'reposts_count',
                     'picture': 'original_pic',
                     # 'pictures': 'pics',
                     'created_at': 'created_at',
                     'source': 'source',
                     'text': 'text',
                     # 'raw_text': 'raw_text',
                     'thumbnail': 'thumbnail_pic',
                 }
                 for field, attr in field_map.items():
                     item[field] = myblog.get(attr)
                 item['user'] = response.meta.get('user_info').get('lfid')
                 yield item
Example #14
    def parse(self, response):
        # print("response.meta", response.meta)
        if ('page' in response.meta):
            page = response.meta['page']
        else:
            page = 1
        result = json.loads(response.text)
        # print(response.text)
        cards = result.get("data").get("cards")
        if not cards:
            return
        for card in cards:
            for group in card.get("card_group") or []:
                user = group.get("user")
                if user:
                    weibo = WeiboItem()
                    weibo['uid'] = user['id']
                    weibo['nikename'] = user['screen_name']
                    weibo['faceurl'] = user['profile_image_url']
                    yield weibo

        request = scrapy.Request(
            "https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_3641513235&since_id=%d"
            % (page),
            callback=self.parse)
        request.meta['page'] = page + 1
        yield request
Example #15
 def parse(self, response):
     text_json = json.loads(response.text)
     self.since_id = text_json.get('data').get('cardlistInfo').get(
         'since_id')
     cards = text_json.get('data').get('cards')
     for it in cards:
         it_son = it.get('mblog')
         if it_son:
             self.created_at = it_son['created_at']
             self.text = it_son['text']
             self.source = it_son['source']
             self.scheme = it['scheme']
             self.reposts_count = it_son['reposts_count']
             self.comments_count = it_son['comments_count']
             self.attitudes_count = it_son['attitudes_count']
             soup = BeautifulSoup(str(self.text), "html.parser")
             self.text = soup.get_text()
             if len(self.created_at) < 6:
                 self.created_at = "%s%s" % ("2020-", self.created_at)
             self.textLength = len(self.text)
             items = WeiboItem(created_at=self.created_at,
                               text=self.text,
                               source=self.source,
                               scheme=self.scheme,
                               reposts_count=self.reposts_count,
                               comments_count=self.comments_count,
                               attitudes_count=self.attitudes_count,
                               textLength=self.textLength)
             yield items
     if not self.since_id:
         return
     urls = "%s%s" % (self.url, str(self.since_id))
     yield scrapy.Request(urls, callback=self.parse)
Example #16
 def parse(self, response):
     js = json.loads(response.text)
     weibo_items = js.get('data').get('cards')
     for weibo_item in weibo_items:
         item = WeiboItem()
         item['time'] = weibo_item.get('mblog').get('created_at')
         item['txt'] = weibo_item.get('mblog').get('text')
         yield item
Example #17
    def parse(self, response):

        selector = Selector(response)

        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            item = WeiboItem()

            device = 'None'
            pubday = ''

            #id = tweet.xpath('@id').extract_first()  # weibo id
            id = tweet.xpath('div/a/text()').extract_first()
            #content = tweet.xpath('div/span[@class="ctt"]/text()').extract_first()  # weibo text
            content = tweet.xpath('div/span[@class="ctt"]')
            info = content[0].xpath('string(.)').extract_first()

            coordinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
            like = re.findall(r'赞\[(\d+)\]', tweet.extract())  # like count
            transfer = re.findall(r'转发\[(\d+)\]', tweet.extract())  # repost count
            comment = re.findall(r'评论\[(\d+)\]', tweet.extract())  # comment count
            others = tweet.xpath('div/span[@class="ct"]/text()').extract_first()  # post time and source (phone or platform)

            print(id)
            print(info)

            if others:
                others = others.split("来自")
                pubday = others[0]
                if len(others) > 1:
                    device = others[1]

            if like:
                like = int(like[0])
                print(like)

            if transfer:
                transfer = int(transfer[0])
                print(transfer)

            if comment:
                comment = int(comment[0])
                print(comment)

            item['nickname'] = id
            item['content'] = info
            item['pubday'] = pubday
            item['device'] = device
            item['like'] = like
            item['transfer'] = transfer
            item['comment'] = comment

            yield item
    def parse_1(self, response):
        # initialize the selenium driver and open a browser that mimics human behaviour
        # to avoid anti-crawling measures
        self.config_driver()
        print("Open driver...")
        self.driver.get(response.url)
        # give the page enough time to finish loading
        time.sleep(20)  # Let the user actually see something!
        page = 1

        # Weibo's hot-post home page loads new posts as you scroll down.
        # Imitate human browsing as much as possible: scroll after a random pause,
        # and randomize the scroll distance a little.
        # Occasionally the page does not load completely and a manual refresh helps;
        # this may be network-related, since it also happens in normal browsing and
        # is probably an issue on Weibo's side.
        while page <= 30:
            time.sleep(max(random.random(), random.gauss(0.5, 1.0)))
            print(f"Scrolled down {page} times...")
            self.driver.execute_script("window.scrollBy(0,{})".format(
                max(0, int(random.gauss(150.0, 25.0)))))
            page += 1

        source = self.driver.page_source
        # parse the html with BeautifulSoup
        soup = BeautifulSoup(source, 'lxml')
        container = soup.find('div', class_='UG_contents')
        items = container.find_all('div', class_=re.compile('UG_list_*'))
        print("此次共获取 {} 条记录".format(len(items)))
        for item_des in items:
            url = '#'
            wb_from = ''
            if item_des.has_attr('href'):
                url = item_des['href']
            elif item_des.find('div', 'vid'):
                if item_des.find('div', 'vid').has_attr('href'):
                    url = item_des.find('div', 'vid')['href']
            if len(url) < 2:
                continue
            item = item_des.find('h3', class_=re.compile('list_title_*'))
            emojis = []
            if item:
                item_text = item.text.strip()
                item_emojis = item.find_all('img', alt=True)
                if item_emojis:
                    for item_emoji in item_emojis:
                        if item_emoji and item_emoji.has_attr('alt'):
                            emojis.append(item_emoji['alt'])
                weibo_item = WeiboItem()
                weibo_item['url'] = url
                # print(item_text, ''.join(emojis))
                content = remove_multi_space(item_text).strip()
                weibo_item['content'] = content + ''.join(emojis)
                sub_info = item_des.find('span', 'subinfo')
                if sub_info and sub_info.text:
                    wb_from = sub_info.text.strip()
                weibo_item['wb_from'] = wb_from
                yield weibo_item
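
remove_multi_space is an external helper in this spider and is not shown here; a plausible sketch, assuming it only collapses runs of whitespace, is:

    import re

    def remove_multi_space(text):
        # collapse consecutive whitespace into single spaces
        return re.sub(r'\s+', ' ', text)
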
Example #19
 def parse(self, response):
     catelog = parse.parse_qs(
         parse.urlparse(parse.unquote(response.url)).query)['q']
     item = WeiboItem()
     cardlist = response.xpath(
         "//div[@class='card-feed']/div[@class='content']//li/img/@src"
     ).extract()
     item['catelog'] = catelog
     item['photos'] = cardlist
     yield item
Example #20
 def parse_content(self, response):
     # item = XiaoshuoItem()
     item = WeiboItem()
     item['url'] = response.url
     item['title'] = response.xpath(
         '//div[@class="bookname"]/h1/text()').extract_first()
     item['content'] = response.xpath(
         '//div[@id="content"]/text()').extract()
     nextpage = response.xpath(
         '//div/a[contains(text(),"下一章")]/@href').extract_first()
     yield item
     print(item['title'])
     # follow the next-chapter link if there is one
     if nextpage:
         yield response.follow(nextpage, callback=self.parse_content)
 def parse(self, response):
     depth = response.meta['depth']
     fail_count = response.meta['fail_count']
     responseUrl = response.url
     requestUrl = response.meta['url']
     # check that the returned url matches the requested url; if not, request it again
     if str(responseUrl) != requestUrl:
         # check the failure count and give up after two failures
         if fail_count <= 1:
             fail_count += 1
             yield scrapy.Request(requestUrl,
                                  callback=self.parse,
                                  method='GET',
                                  dont_filter=True,
                                  meta={
                                      'depth': depth,
                                      'url': requestUrl,
                                      'fail_count': fail_count
                                  },
                                  headers=self.headers,
                                  cookies=self.cookie)
         else:
             # record the url that failed to load
             item = FailUrlItem()
             item['fail_url'] = requestUrl
             yield item
     else:
         item = WeiboItem()
         data = response.body.decode()
         html_etree = self.dataToHtml(data)
         item['focus_by'] = html_etree.xpath('//h1/text()')[0]
         focus_li = html_etree.xpath(
             '//div[@class="title W_fb W_autocut "]')
         for focus in focus_li:
             item['name'] = focus.xpath('.//a/text()')[0]
             focus_url = focus.xpath('.//a[@class="S_txt1"]/@href')
             item['url'] = 'https://weibo.com' + focus_url[0].split('?')[0]
             item['fans_num'] = 10000000
             url = (item['url'] +
                    '/follow?from=page_100306&wvr=6&mod=headfollow')
             yield item
             yield scrapy.Request(url,
                                  callback=self.parse_focus_list,
                                  method='GET',
                                  dont_filter=True,
                                  meta={
                                      'depth': depth,
                                      'url': url,
                                      'fail_count': fail_count
                                  },
                                  headers=self.headers,
                                  cookies=self.cookie)
Example #22
 def getItem(self, card):
     # builds an item from the first card group only, then returns
     for card_group in card:
         item = WeiboItem()
         desc1 = card_group.get("desc1")
         desc2 = card_group.get("desc2")
         id = card_group.get('user').get("id")
         name = card_group.get('user').get("screen_name")
         item['name'] = name
         item['weiboid'] = id
         item["desc1"] = desc1
         item["desc2"] = desc2
         return item
Example #23
    def parse4(self, response):
        data = json.loads(response.text)
        # print(data)  # the response may also be "no data yet"

        # print(len(data['data']['data']))
        for entry in data['data']['data']:
            userid = entry['user']['id']
            username = entry['user']['screen_name']
            item = WeiboItem()
            item["userid"] = userid
            item["username"] = username
            yield item
Example #24
 def item_parse(self, response):
     imgs = response.xpath("//img/@src").extract()
     for img_url in imgs:
         myitem = WeiboItem()  # instantiate a fresh item for each image
         myitem['image_urls'] = img_url
         yield myitem
     next_link = response.xpath(
         "//div[@class='NewPages']/ul//a/@href").extract()
     if next_link and next_link[-1] != '#':
         yield scrapy.Request(
             "http://www.umei.cc/meinvtupian/meinvxiezhen/" +
             next_link[-1],
             callback=self.item_parse)
Example #25
 def parse2(self, response):
     data = json.loads(response.text)
     for entry in data['data']['data']:
         userid = entry['user']['id']
         username = entry['user']['screen_name']
         # print(userid, username)
         item = WeiboItem()
         item["userid"] = userid
         item["username"] = username
         self.count = self.count + 1
         print(self.count)
         time.sleep(1)  # note: a blocking sleep stalls Scrapy's async reactor
         yield item
Example #26
    def parse_person(self, response):
        """
        解析个人信息
        :param response:
        :return:
        """
        jsonObj = json.loads(response.text)

        # 获取信息出错
        if jsonObj.get('msg'):
            # 再次调用parse_person
            yield Request(url=response.url,
                          headers=self.headers,
                          callback=self.parse_person)
        item = WeiboItem()
        # 从response.url中截取uid
        uid = response.url.split('_')[0].split('=')[1][6:]
        item['uid'] = uid

        # iterate over the cards
        cards = jsonObj.get('cards') or []
        for something in cards:
            card_group = something.get('card_group') or []

            # walk the card_group, picking out the fields of interest
            for card in card_group:
                item_name = card.get('item_name')
                # logging.debug(item_name)
                if '昵称' == item_name:
                    item['name'] = card.get('item_content')
                elif '性别' == item_name:
                    item['gender'] = card.get('item_content')
                elif '所在地' == item_name:
                    item['location'] = card.get('item_content')
                elif '简介' == item_name:
                    item['intro'] = card.get('item_content')
                elif '注册时间' == item_name:
                    item['signin'] = card.get('item_content')
                elif '学校' == item_name:
                    item['school'] = card.get('item_content')
                elif '公司' == item_name:
                    item['company'] = card.get('item_content')

        yield item

        # fetch this user's followers
        yield Request(url='{0}?containerid=100505{1}_-_FOLLOWERS'.format(
            self.getSecond, uid),
                      headers=self.headers,
                      callback=self.parse_followers)
Example #27
 def start_requests(self):
     """
     以一个或者多个微博id为根节点微博,开始根据转发关系递归爬取,并传递自己的id给转发结点作为父亲id
     """
     #cookies = self.cookies_list[2]
     for weibo_id in self.start_weibo_id:
         self.weibo_id_set.add(weibo_id)
         item = WeiboItem()
         item['weibo_id'] = weibo_id
         item['father_weibo_id'] = 'ROOT'
         yield scrapy.Request(url=self.weibo_template % (weibo_id, 1),
                              cookies=self.get_random_cookies(),
                              meta={'item': item},
                              callback=self.parse_content)
Example #28
 def parse_info(self, response):
     user = WeiboItem()
     info_mapping = {
         u'昵称': 'screen_name',
         u'性别': 'gender',
         u'所在地': 'location',
         u'生日': 'birthday'
     }
     user['uid'] = response.meta['uid']
     for info in response.xpath('//div[@class="item-info-page"]'):
         if info.xpath('span/text()').extract():
             key = info.xpath('span/text()').extract()[0]
             if key in info_mapping:
                 user[info_mapping[key]] = info.xpath('p/text()').extract()[0]
     return user
Example #29
 def parse(self, response):
     item = WeiboItem()
     datas = response.xpath('//div[@class="c"]/div/span[@class="ctt"]')
     conts = [data.xpath('string(.)').extract() for data in datas]
     item['content'] = conts
     item['times'] = response.xpath('//span[@class="ct"]/text()').extract()
     item['likes'] = response.xpath(
         '//div[@class="c"]/div[2]/a[3]/text()').extract()
     item['comments'] = response.xpath(
         '//div[@class="c"]/div/a[@class="cc"]/text()').extract()
     item['transfer'] = response.xpath(
         '//div[@class="c"]/div[2]/a[4]/text()').extract()
     yield item
    def parse_search(self, response):
        res = json.loads(response.text)
        if (str(res['ok']) == '1'):
            for key in res['data']['cards'][0]['card_group']:
                item = WeiboItem()
                # weibo info
                item['weibo_id'] = key['mblog']['id']
                item['weibo_content'] = self.filt_re.sub(
                    '', key['mblog']['text'])
                item['weibo_repost'] = key['mblog']['reposts_count']
                item['weibo_comment'] = key['mblog']['comments_count']
                item['weibo_like'] = key['mblog']['attitudes_count']

                timestr = str(key['mblog']['created_at'])
                ctime = self.get_ctime(timestr)
                item['weibo_ctime'] = ctime

                url_pos = key['scheme'].index("?mblogid")
                item['weibo_url'] = key['scheme'][0:url_pos]

                # user info
                item['user_id'] = str(key['mblog']['user']['id'])
                item['user_name'] = key['mblog']['user']['screen_name']
                item['user_weibo'] = key['mblog']['user']['statuses_count']
                item['user_follow'] = key['mblog']['user']['follow_count']
                item['user_fan'] = key['mblog']['user']['followers_count']
                item['user_verified'] = key['mblog']['user']['verified']

                user_url = key['mblog']['user']['profile_url']
                uurl_pos = user_url.index('?uid=')
                item['user_url'] = user_url[0:uurl_pos]
                item['mediaName'] = self.mediaName
                item['keyword'] = response.meta['keyword']

                # print('Weibo crawled at %s' %ctime)
                if item['weibo_content'].strip() == '':
                    return

                for word in self.filterword:
                    if item['weibo_content'].find(word) != -1:
                        return

                yield item
                #yield scrapy.Request(url=key['scheme'],callback=self.parse)
        else:
            # i=i-1
            if 'msg' in res:
                print(res['msg'])