def getWeibos(self, keyword, page=1, count=None):
    """Scrape one page of t.hexun.com topic search results for ``keyword``.

    Args:
        keyword: Search term. It is serialised with ``json.dumps``; the
            backslashes of the result are replaced by ``%`` and the
            surrounding double quotes are removed to build the ``value``
            query parameter.
        page: Result page number, sent as ``pg``.
        count: Unused; the request URL carries no page-size parameter.

    Returns:
        A dict ``{'total_count': int, 'msgs': [...]}`` in which every
        message is a dict with the keys ``avatar``, ``nickname``, ``text``
        and ``datetime``. ``None`` is returned when the response has no
        truthy ``result`` entry.

    Raises:
        IndexError: If an element looked up with ``[0]`` is absent.
        ValueError: If the extracted total is not a valid integer.
    """
    encoded_keyword = json.dumps(keyword).replace('\\', '%').replace('"', '')
    url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (
        encoded_keyword, page)
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' not in result or not result['result']:
        return None

    # The body is decoded as gb2312 before it is handed to the HTML parser.
    soup = BeautifulSoup(result['info'].decode('gb2312'))

    # Keep only what sits between the two marker characters of the header.
    header_text = soup.select('.headerR1')[0].get_text()
    total_num = header_text.split('共')[-1].split('条')[0].strip()
    total_count = int(total_num)

    allmsgs = []
    for msg_soup in soup.select('.nr_con'):
        # NOTE(review): this is the href of the first anchor, not an image
        # source -- confirm it really is the avatar URL callers expect.
        avatar = 'http://t.hexun.com%s' % msg_soup.select(
            '.nr_conLa > a')[0].get('href')

        # Split on the FIRST colon only: the message itself may contain
        # colons (links, times), and everything after the nickname belongs
        # to the text. A line without any colon yields an empty text
        # instead of raising IndexError.
        speech = msg_soup.select('.nr_shuo')[0].get_text()
        nickname, _, text = speech.partition(':')

        ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
        allmsgs.append({
            'avatar': avatar,
            'nickname': nickname,
            'text': text,
            'datetime': ts,
        })

    return {'total_count': total_count, 'msgs': allmsgs}
def getWeibos(self, keyword, page=1, size=10, sid=None):
    """Query the ti.3g.qq.com touch search endpoint for ``keyword``.

    Args:
        keyword: Search term, interpolated into the URL as given.
        page: Page number, sent as ``pid``.
        size: Page size, sent as ``psize``.
        sid: Value for the ``sid`` parameter; when falsy, ``self.sid`` is
            used instead.

    Returns:
        ``{'msgs': ..., 'total_pages': ..., 'total_count': ...}`` built
        from the JSON reply, or ``None`` when either the request result or
        the reply's own ``result`` field does not indicate success.
    """
    session_id = sid if sid else self.sid
    url = 'http://ti.3g.qq.com/touch/s?sid=%s&aid=vaction&more=1&mst=33&ac=60&keyword=%s&dl2=1&dumpJSON=1&pageid=search&pid=%d&psize=%d' % (
        session_id, keyword, page, size)

    response = WeiboCrawler.request(self, url, self.headers)
    if 'result' not in response or not response['result']:
        return None

    payload = json.loads(response['info'])
    # The reply carries its own status; only the string '0' is accepted.
    if 'result' not in payload or payload['result'] != '0':
        return None

    messages = payload['jsonDump']['msgs']
    paging = payload['info']
    return {
        'msgs': messages,
        'total_pages': paging['pageCount'],
        'total_count': paging['totalCount'],
    }
def getSid(self):
    """Log in through pt.3g.qq.com and remember the returned session id.

    ``self.username`` and ``self.password`` are sent as the ``qq`` and
    ``pmd5`` query parameters, together with a random ``r`` value.

    Returns:
        The session id, which is also stored on ``self.sid``, or ``None``
        when the request did not succeed or the decoded reply does not
        have exactly eight entries.
    """
    url = 'http://pt.3g.qq.com/login?act=json&format=2&bid_code=microblogLogin&r=%f&qq=%s&pmd5=%s&go_url=http://ti.3g.qq.com/touch/iphone/index.jsp?g_f=18106' % (
        random.random(), self.username, self.password)

    response = WeiboCrawler.request(self, url, self.headers)
    if 'result' not in response or not response['result']:
        return None

    # Unwrap the callback: remove the call prefix and the last two
    # characters (presumably the closing ");" -- confirm against a reply).
    raw_json = response['info'].replace('pt.handleLoginResult(', '')[:-2]
    fields = json.loads(raw_json)
    if len(fields) != 8:
        return None

    # The session id is the fifth entry of the reply.
    sid = fields[4]
    self.sid = sid
    return sid
def getWeibos(self, keyword, page=1, count=10):
    """Search m.weibo.cn for ``keyword`` and return one page of results.

    Args:
        keyword: Search term, sent as ``key`` exactly as given.
        page: Page number, sent as ``page``.
        count: Number of items requested, sent as ``count``.

    Returns:
        ``{'total_count': ..., 'total_pages': ..., 'msgs': ...}`` filled
        from the ``total_number``, ``maxPage`` and ``mblogList`` entries of
        the JSON reply, or ``None`` when the request result or the reply's
        ``ok`` entry is missing or falsy.
    """
    url = 'http://m.weibo.cn/searchs/weibo?key=%s&page=%d&count=%d' % (
        keyword, page, count)

    response = WeiboCrawler.request(self, url, self.headers)
    if 'result' not in response or not response['result']:
        return None

    payload = json.loads(response['info'])
    if 'ok' not in payload or not payload['ok']:
        return None

    return {
        'total_count': payload['total_number'],
        'total_pages': payload['maxPage'],
        'msgs': payload['mblogList'],
    }
def getSid(self):
    """Request a session id from the pt.3g.qq.com login endpoint.

    The request URL carries a random ``r`` value plus ``self.username``
    (as ``qq``) and ``self.password`` (as ``pmd5``).

    Returns:
        The fifth entry of the decoded reply when that reply has exactly
        eight entries; the value is stored on ``self.sid`` as well.
        Otherwise ``None``.
    """
    nonce = random.random()
    url = 'http://pt.3g.qq.com/login?act=json&format=2&bid_code=microblogLogin&r=%f&qq=%s&pmd5=%s&go_url=http://ti.3g.qq.com/touch/iphone/index.jsp?g_f=18106' % (
        nonce, self.username, self.password)
    reply = WeiboCrawler.request(self, url, self.headers)

    session_id = None
    if 'result' in reply and reply['result']:
        # The body is a callback invocation; strip its prefix and its
        # final two characters so that only the JSON argument is left.
        body = reply['info']
        argument = body.replace('pt.handleLoginResult(', '')[:-2]
        decoded = json.loads(argument)
        if len(decoded) == 8:
            session_id = decoded[4]
            self.sid = session_id
    return session_id
def getWeibos(self, keyword, page=1, count=10):
    """Fetch one page of m.weibo.cn search results for ``keyword``.

    Args:
        keyword: Search term placed in the ``key`` query parameter.
        page: Page number placed in the ``page`` query parameter.
        count: Item count placed in the ``count`` query parameter.

    Returns:
        A dict with ``total_count``, ``total_pages`` and ``msgs`` read from
        the reply's ``total_number``, ``maxPage`` and ``mblogList`` entries.
        ``None`` when the request was not successful or the reply has no
        truthy ``ok`` entry.
    """
    url = 'http://m.weibo.cn/searchs/weibo?key=%s&page=%d&count=%d' % (
        keyword, page, count)
    reply = WeiboCrawler.request(self, url, self.headers)

    request_ok = 'result' in reply and reply['result']
    if not request_ok:
        return None

    data = json.loads(reply['info'])
    search_ok = 'ok' in data and data['ok']
    if not search_ok:
        return None

    total_count = data['total_number']
    total_pages = data['maxPage']
    messages = data['mblogList']
    return {
        'total_count': total_count,
        'total_pages': total_pages,
        'msgs': messages,
    }
def getWeibos(self, keyword, page=1, size=10, sid=None):
    """Run a keyword search against the ti.3g.qq.com touch endpoint.

    Args:
        keyword: Search term, placed in the URL without further encoding.
        page: Page number (``pid`` query parameter).
        size: Page size (``psize`` query parameter).
        sid: Session id for the request; ``self.sid`` is substituted when
            this is falsy.

    Returns:
        A dict with ``msgs``, ``total_pages`` and ``total_count`` taken
        from the reply, or ``None`` if the request failed or the reply's
        ``result`` field is not the string ``'0'``.
    """
    if not sid:
        sid = self.sid

    url_args = (sid, keyword, page, size)
    url = 'http://ti.3g.qq.com/touch/s?sid=%s&aid=vaction&more=1&mst=33&ac=60&keyword=%s&dl2=1&dumpJSON=1&pageid=search&pid=%d&psize=%d' % url_args
    reply = WeiboCrawler.request(self, url, self.headers)

    fetched = 'result' in reply and reply['result']
    if not fetched:
        return None

    data = json.loads(reply['info'])
    accepted = 'result' in data and data['result'] == '0'
    if not accepted:
        return None

    found = data['jsonDump']['msgs']
    summary = data['info']
    return {
        'msgs': found,
        'total_pages': summary['pageCount'],
        'total_count': summary['totalCount'],
    }