Example #1
 def crawl(self):
     key = self.key
     data = self.data
     homepage = "http://card.weibo.com/article/aj/articleshow?cid=" + key
     url = "http://weibo.com/p/" + key
     html_stream = _get_url(homepage)
     # The card API wraps the rendered article HTML in a JSON envelope.
     json_stream = change_to_json(str(html_stream.text))
     html_stream = json_stream['data']['article']
     soup = HandleContent.get_BScontext(html_stream, text=True)
     title = soup.select('.title')[0].text
     pubtime = soup.select('.time')[0].text
     pubtime = HandleContent.strformat(str(pubtime))
     content = soup.select('.WBA_content')[0]
     content = clear_label(list(content))
     comment = {}
     text = HandleContent.get_BScontext(content, text=True).text
     comment['content'] = clear_space(text)
     publishers = soup.select('.S_link2')
     # author = reduce(lambda x, y: x + y, [item.text for item in authors])
     try:
         # Prefer the second .S_link2 match when more than one is present.
         publisher = publishers[1].text if len(
             publishers) > 1 else publishers[0].text
     except IndexError:
         publisher = ''
     date = new_time()
     crawl_data = {
         'title': title,
         'pubtime': pubtime,
         'source': 'weibo',
         'publisher': publisher,
         'crtime_int': date.get('crtime_int'),
         'crtime': date.get('crtime'),
         'origin_source': u'微博搜索',
         'url': url,
         'key': data.get('key', ''),
         'type': u'元搜索',
         'source_type': data.get('source_type', ''),
         'content': content,
         'comment': comment,
     }
     model = SearchArticleModel(crawl_data)
     export(model)
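The snippets here lean on project helpers that are not shown (_get_url, change_to_json, clear_space, clear_label, new_time, HandleContent, export). As a rough sketch only, here is what three of them might look like; the bodies below are assumptions made to keep the examples readable, not the project's real code. In particular, change_to_json is taken to parse plain JSON, clear_space to strip all whitespace, and crtime_int to be microseconds since the epoch (hinted at by the /1000000 in the scheduling code of the later examples).

# -*- coding: utf-8 -*-
# Hypothetical stand-ins for helpers called in the examples; the names match
# the calls above, the implementations are guesses.
import json
import re
import time

def change_to_json(raw_text):
    # Assumed: the response body is plain JSON text.
    return json.loads(raw_text)

def clear_space(text):
    # Assumed: collapse every whitespace run, which also folds the
    # multi-line URL strings used in Examples #3 and #4.
    return re.sub(r'\s+', '', text) if text else ''

def new_time():
    # Assumed: crawl time as a display string plus an integer count of
    # microseconds since the epoch.
    now = time.time()
    return {
        'crtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now)),
        'crtime_int': int(now * 1000000),
    }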
Example #2
 def crawl(self):
     key = self.key
     data = self.data
     homepage = "http://card.weibo.com/article/aj/articleshow?cid=" + key
     url = "http://weibo.com/p/" + key
     html_stream = _get_url(homepage)
     json_stream = change_to_json(str(html_stream.text))
     html_stream = json_stream['data']['article']
     soup = HandleContent.get_BScontext(html_stream, text=True)
     title = soup.select('.title')[0].text
     pubtime = soup.select('.time')[0].text
     pubtime = HandleContent.strformat(str(pubtime))
     content = soup.select('.WBA_content')[0]
     content = clear_label(list(content))
     comment = {}
     text = HandleContent.get_BScontext(content, text=True).text
     comment['content'] = clear_space(text)
     publishers = soup.select('.S_link2')
     # author = reduce(lambda x, y: x + y, [item.text for item in authors])
     try:
         publisher = publishers[1].text if len(publishers) > 1 else publishers[0].text
     except IndexError:
         publisher = ''
     date = new_time()
     crawl_data = {
         'title': title,
         'pubtime': pubtime,
         'source': 'weibo',
         'publisher': publisher,
         'crtime_int': date.get('crtime_int'),
         'crtime': date.get('crtime'),
         'origin_source': u'微博搜索',
         'url': url,
         'key': data.get('key', ''),
         'type': u'元搜索',
         'source_type': data.get('source_type', ''),
         'content': content,
         'comment': comment,
     }
     model = SearchArticleModel(crawl_data)
     export(model)
Example #3
 def crawl(self):
     key = str(self.key)
     data = self.data
     homepage = "http://api.weibo.cn/2/cardlist?\
                 gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
                 wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
                 v_f=1&s=d2672a12&lang=zh_CN&ua=iPhone7,2__weibo__5.1.2__iphone__os8.1.3&\
                 uicode=10000198&featurecode=10000085&luicode=10000003&count=20&\
                 extparam=100103type=1&cuid=2257007621&sid=t_wap_ios&category=1&\
                 pos=1_-1&wm=3333_2001&containerid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&\
                 fid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&lfid=100103type%3D1&\
                 sourcetype=page&lcardid=user&page=1"
     # homepage = "http://api.weibo.cn/2/guest/cardlist?gsid=4wMJ47123kZuG0fKGxlRC15McKa50&uid=1001503246310&\
     #             wm=3333_2001&i=27bd163&b=0&from=1052093010&checktoken=c54259b09129d101b9669b5d93a04c0e&c=iphone&\
     #             v_p=18&skin=default&v_f=1&s=8a12fc6c&did=38d63734cc7427ebb2cb77612c1948cf&lang=zh_CN&ua=iPhone7,\
     #             2__weibo__5.2.0__iphone__os8.2&uid=1001503246310&extparam=100103\
     #             type%3D1%26q%3D%E5%8C%97%E4%BA%AC%E5%AE%89%E7%9B%91%26t%3D0%26sid%3Dt_wap_ios%26category%3D1%26pos%3D1_-1%26wm%3D3333_2001&\
     #             count=20&luicode=10000003&containerid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&featurecode=10000085&\
     #             uicode=10000198&fid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&checktoken=\
     #             c54259b09129d101b9669b5d93a04c0e&did=38d63734cc7427ebb2cb77612c1948cf&page=1"
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)
     json_stream = change_to_json(str(html_stream.text))
     cards = json_stream['cards']
     for item in cards:
         scheme = re.search(r'=(.+?)$', item.get('scheme',''))
         scheme = scheme.group(1) if scheme else ''
         # Build the post's weibo.com URL from the id in the card's scheme field.
         url = "http://weibo.com/%s/%s?type=comment" % (
             data.get('id', ''), scheme)
         item = item.get('mblog',{})
         item = item.get('retweeted_status',item)
         text = item.get('text','')
         title = re.search(ur'【(.+?)】', text)
         title = title.group(1) if title else ''
         if not title:
             title = re.search(ur'#(.+?)#', text)
             title = title.group(1) if title else text[0:20]+'...'
         subtitle = re.search(ur'#(.+?)#', text)
         subtitle = subtitle.group(1) if subtitle else ''
         pubtime = item.get('created_at', '')
         pubtime = HandleContent.strformat(str(pubtime))
         reposts_count = item.get('reposts_count', '')
         comments_count = item.get('comments_count', '')
         attitudes_count = item.get('attitudes_count', '')
         thumbnail_pic = item.get('thumbnail_pic', '')
         bmiddle_pic = item.get('bmiddle_pic', '')
         original_pic = item.get('original_pic', '')
         mid = item.get('mid', '')
         author = item.get('user',{}).get('name','')
         comment = {
             'reposts_count': str(reposts_count),
             'attitudes_count': str(attitudes_count),
             'comments_count': str(comments_count)
         }
         subtitles = []
         subtitles.append(subtitle)
         date = new_time()
         crawl_data = {
             'province': self.data.get('province',''),
             'city': self.data.get('city',''),
             'district': self.data.get('district',''),
             'url': url,
             'title': title,
             'subtitle': subtitles,
             'content': text,
             'pubtime': pubtime,
             'crtime_int': date.get('crtime_int'),
             'crtime': date.get('crtime'),
             'source': 'weibo',
             'publisher': self.data.get('publisher',''),
             'author': author,
             'origin_source': u'新浪微博',
             'type': u'微博',
             'comment': comment
         }
         model = WeiboArticleModel(crawl_data)
         if export(model):
             againt_data = {
                 'wid': model['id'],
                 'type': u'微博',
                 'expire': date.get('crtime_int')/1000000 + 604800,
             }
             Scheduler.schedule(AgainCrawler.type, key=mid, data=againt_data,
                              reset=True, interval=21600)
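The title and subtitle logic in the loop above boils down to three fallbacks: prefer a 【...】 headline, then a #...# hashtag, then a 20-character excerpt of the text. A standalone sketch of that rule, assuming nothing beyond the regexes already used in the example (the helper name is not from the project):

# -*- coding: utf-8 -*-
# Sketch of the title/subtitle fallbacks from Example #3; the helper name
# extract_title_subtitle is hypothetical.
import re

def extract_title_subtitle(text):
    headline = re.search(u'\u3010(.+?)\u3011', text)  # the 【...】 brackets
    hashtag = re.search(u'#(.+?)#', text)             # the #...# hashtag
    if headline:
        title = headline.group(1)
    elif hashtag:
        title = hashtag.group(1)
    else:
        title = text[:20] + '...'
    subtitle = hashtag.group(1) if hashtag else ''
    return title, subtitle

For a post such as u'【A】text #B#' this returns title u'A' and subtitle u'B', which is what the crawl loop stores in crawl_data.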
Example #4
    def crawl(self):
        key = str(self.key)
        data = self.data
        homepage = "http://api.weibo.cn/2/cardlist?\
                    gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
                    wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
                    v_f=1&s=d2672a12&lang=zh_CN&ua=iPhone7,2__weibo__5.1.2__iphone__os8.1.3&\
                    uicode=10000198&featurecode=10000085&luicode=10000003&count=20&\
                    extparam=100103type=1&cuid=2257007621&sid=t_wap_ios&category=1&\
                    pos=1_-1&wm=3333_2001&containerid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&\
                    fid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&lfid=100103type%3D1&\
                    sourcetype=page&lcardid=user&page=1"

        # homepage = "http://api.weibo.cn/2/guest/cardlist?gsid=4wMJ47123kZuG0fKGxlRC15McKa50&uid=1001503246310&\
        #             wm=3333_2001&i=27bd163&b=0&from=1052093010&checktoken=c54259b09129d101b9669b5d93a04c0e&c=iphone&\
        #             v_p=18&skin=default&v_f=1&s=8a12fc6c&did=38d63734cc7427ebb2cb77612c1948cf&lang=zh_CN&ua=iPhone7,\
        #             2__weibo__5.2.0__iphone__os8.2&uid=1001503246310&extparam=100103\
        #             type%3D1%26q%3D%E5%8C%97%E4%BA%AC%E5%AE%89%E7%9B%91%26t%3D0%26sid%3Dt_wap_ios%26category%3D1%26pos%3D1_-1%26wm%3D3333_2001&\
        #             count=20&luicode=10000003&containerid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&featurecode=10000085&\
        #             uicode=10000198&fid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&checktoken=\
        #             c54259b09129d101b9669b5d93a04c0e&did=38d63734cc7427ebb2cb77612c1948cf&page=1"
        homepage = clear_space(homepage)
        html_stream = _get_url(homepage)
        json_stream = change_to_json(str(html_stream.text))
        cards = json_stream['cards']
        for item in cards:
            scheme = re.search(r'=(.+?)$', item.get('scheme', ''))
            scheme = scheme.group(1) if scheme else ''
            url = "http://weibo.com/%s/%s?type=comment" % (data.get(
                'id', ''), scheme)
            item = item.get('mblog', {})
            item = item.get('retweeted_status', item)
            text = item.get('text', '')
            title = re.search(ur'【(.+?)】', text)
            title = title.group(1) if title else ''
            if not title:
                title = re.search(ur'#(.+?)#', text)
                title = title.group(1) if title else text[0:20] + '...'
            subtitle = re.search(ur'#(.+?)#', text)
            subtitle = subtitle.group(1) if subtitle else ''
            pubtime = item.get('created_at', '')
            pubtime = HandleContent.strformat(str(pubtime))
            reposts_count = item.get('reposts_count', '')
            comments_count = item.get('comments_count', '')
            attitudes_count = item.get('attitudes_count', '')
            thumbnail_pic = item.get('thumbnail_pic', '')
            bmiddle_pic = item.get('bmiddle_pic', '')
            original_pic = item.get('original_pic', '')
            mid = item.get('mid', '')
            author = item.get('user', {}).get('name', '')
            comment = {
                'reposts_count': str(reposts_count),
                'attitudes_count': str(attitudes_count),
                'comments_count': str(comments_count)
            }
            subtitles = []
            subtitles.append(subtitle)
            date = new_time()
            crawl_data = {
                'province': self.data.get('province', ''),
                'city': self.data.get('city', ''),
                'district': self.data.get('district', ''),
                'url': url,
                'title': title,
                'subtitle': subtitles,
                'content': text,
                'pubtime': pubtime,
                'crtime_int': date.get('crtime_int'),
                'crtime': date.get('crtime'),
                'source': data["source"],
                'publisher': self.data.get('publisher', ''),
                'author': author,
                'origin_source': u'新浪微博',
                'comment': comment
            }
            model = WeiboArticleModel(crawl_data)
            if export(model):
                againt_data = {
                    'wid': model['id'],
                    'expire': date.get('crtime_int') / 1000000 + 604800,
                }
                Scheduler.schedule(AgainCrawler.type,
                                   key=mid,
                                   data=againt_data,
                                   reset=True,
                                   interval=21600)
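In Examples #3 and #4 every successfully exported post is handed back to the scheduler so its repost/comment/attitude counts can be refreshed later. Reading the magic numbers: assuming crtime_int is microseconds since the epoch, expire is set one week after the crawl, and interval=21600 asks for a re-crawl roughly every six hours. A small sketch of that arithmetic, names mine:

# Sketch of the expiry arithmetic behind the Scheduler.schedule calls,
# assuming crtime_int counts microseconds since the epoch.
ONE_WEEK = 7 * 24 * 3600   # 604800 seconds, the 'expire' horizon
SIX_HOURS = 6 * 3600       # 21600 seconds, the re-crawl interval

def expire_at(crtime_int):
    # Microseconds -> seconds, then stop re-crawling one week later.
    return crtime_int // 1000000 + ONE_WEEK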