Ejemplo n.º 1
0
 def crawl(self):
     """Fetch a Weibo article by its card id and export it.

     Reads ``self.key`` (the article card id) and ``self.data`` (search
     metadata such as the query key and source type), scrapes the title,
     publish time, publisher and body, then wraps everything in a
     ``SearchArticleModel`` and hands it to ``export``.
     """
     key = self.key
     data = self.data
     # The AJAX endpoint returns JSON whose data.article field carries the
     # article's HTML fragment; the plain /p/ URL is kept as the record URL.
     homepage = "http://card.weibo.com/article/aj/articleshow?cid=" + key
     url = "http://weibo.com/p/" + key
     html_stream = _get_url(homepage)
     json_stream = change_to_json(str(html_stream.text))
     html_stream = json_stream['data']['article']
     soup = HandleContent.get_BScontext(html_stream, text=True)
     title = soup.select('.title')[0].text
     pubtime = soup.select('.time')[0].text
     pubtime = HandleContent.strformat(str(pubtime))
     content = soup.select('.WBA_content')[0]
     content = clear_label(list(content))
     comment = {}
     text = HandleContent.get_BScontext(content, text=True).text
     comment['content'] = clear_space(text)
     publishers = soup.select('.S_link2')
     # author = reduce(lambda x, y: x + y, [item.text for item in authors])
     # Prefer the second .S_link2 match when there is one, otherwise the
     # first.  The only expected failure is an empty selection, which raises
     # IndexError -- a bare except here used to hide unrelated bugs too.
     try:
         publisher = publishers[1].text if len(
             publishers) > 1 else publishers[0].text
     except IndexError:
         publisher = ''
     date = new_time()
     crawl_data = {
         'title': title,
         'pubtime': pubtime,
         'source': 'weibo',
         'publisher': publisher,
         'crtime_int': date.get('crtime_int'),
         'crtime': date.get('crtime'),
         'origin_source': u'微博搜索',
         'url': url,
         'key': data.get('key', ''),
         'type': u'元搜索',
         'source_type': data.get('source_type', ''),
         'content': content,
         'comment': comment,
     }
     model = SearchArticleModel(crawl_data)
     export(model)
Ejemplo n.º 2
0
 def crawl(self):
     """Fetch a Weibo article by its card id and export it.

     Reads ``self.key`` (the article card id) and ``self.data`` (search
     metadata such as the query key and source type), scrapes the title,
     publish time, publisher and body, then wraps everything in a
     ``SearchArticleModel`` and hands it to ``export``.
     """
     key = self.key
     data = self.data
     # The AJAX endpoint returns JSON whose data.article field carries the
     # article's HTML fragment; the plain /p/ URL is kept as the record URL.
     homepage = "http://card.weibo.com/article/aj/articleshow?cid=" + key
     url = "http://weibo.com/p/" + key
     html_stream = _get_url(homepage)
     json_stream = change_to_json(str(html_stream.text))
     html_stream = json_stream['data']['article']
     soup = HandleContent.get_BScontext(html_stream, text=True)
     title = soup.select('.title')[0].text
     pubtime = soup.select('.time')[0].text
     pubtime = HandleContent.strformat(str(pubtime))
     content = soup.select('.WBA_content')[0]
     content = clear_label(list(content))
     comment = {}
     text = HandleContent.get_BScontext(content, text=True).text
     comment['content'] = clear_space(text)
     publishers = soup.select('.S_link2')
     # author = reduce(lambda x, y: x + y, [item.text for item in authors])
     # Prefer the second .S_link2 match when there is one, otherwise the
     # first.  The only expected failure is an empty selection, which raises
     # IndexError -- a bare except here used to hide unrelated bugs too.
     try:
         publisher = publishers[1].text if len(publishers) > 1 else publishers[0].text
     except IndexError:
         publisher = ''
     date = new_time()
     crawl_data = {
         'title': title,
         'pubtime': pubtime,
         'source': 'weibo',
         'publisher': publisher,
         'crtime_int': date.get('crtime_int'),
         'crtime': date.get('crtime'),
         'origin_source': u'微博搜索',
         'url': url,
         'key': data.get('key', ''),
         'type': u'元搜索',
         'source_type': data.get('source_type', ''),
         'content': content,
         'comment': comment,
     }
     model = SearchArticleModel(crawl_data)
     export(model)