def validSina(self, url, sp):
    """Resolve a Sina blog URL to a ``(username, homepage, sp)`` tuple.

    URLs containing ``blog.sina.com.cn/s/`` or
    ``http://blog.sina.com.cn/lm/`` are rejected outright.  For ``/m/`` URLs
    and plain ``http://blog.sina.com.cn/<name>`` URLs the name is taken from
    the URL and passed through ``self.normalizeName``.  For ``/u/`` URLs the
    page is fetched and the name is read from its ``uhost : "..."`` value.

    Returns None when no username can be determined or when the username is
    ``main_v5``.
    """
    if 'blog.sina.com.cn/s/' in url or 'http://blog.sina.com.cn/lm/' in url:
        return None

    if 'blog.sina.com.cn/m/' in url:
        found = re.findall(r'http://blog.sina.com.cn/m/([^^].*)', url, re.I)
        if not found:
            return None
        username = self.normalizeName(found[0])
    elif 'blog.sina.com.cn/u/' in url:
        # The /u/ form does not carry the name in the URL; read it from the page.
        response = Mario().get(url)
        if not (response and response.body):
            return None
        found = re.findall(r'uhost : "([^^].*?)"', response.body)
        if not found:
            return None
        username = found[0]
    elif 'http://blog.sina.com.cn/' in url:
        found = re.findall(r'http://blog.sina.com.cn/([^^].*)', url, re.I)
        if not found:
            return None
        username = self.normalizeName(found[0])
    else:
        username = ''

    if not username or username in ('main_v5', ):
        return None
    return (username, 'http://blog.sina.com.cn/%s/' % username, sp)
def check_proxy(self, proxy):
    """Request http://www.baidu.com through ``proxy``.

    Returns None immediately for a falsy ``proxy``; otherwise returns
    whatever ``Mario.get`` returns for the proxied request.
    """
    if proxy:
        fetcher = Mario()
        logger.debug('proxy: %s'%proxy)
        return fetcher.get(url='http://www.baidu.com', proxy={'url': proxy})
    return None
def get(self, keyword):
    """Fetch the page for ``keyword`` and return ``self.parse`` of its body.

    The keyword is URL-quoted and substituted into ``self.url_struct``.
    Returns an empty list when the fetch yields a falsy response.
    """
    target = self.url_struct % quote(keyword)
    response = Mario(None).get(target)
    if not response:
        return []
    return self.parse(response.body)
def get(self, keyword):
    """Fetch the page for ``keyword`` and return ``self.parse`` of its body.

    The keyword is re-encoded from UTF-8 to GBK, URL-quoted, and substituted
    into ``self.url_struct``.  Returns an empty list when the fetch yields a
    falsy response.
    """
    # assumes keyword is a UTF-8 encoded byte string (Python 2 str) -- TODO confirm
    gbk_keyword = keyword.decode('utf-8').encode('gbk')
    target = self.url_struct % quote(gbk_keyword)
    response = Mario(None).get(target)
    if not response:
        return []
    return self.parse(response.body)
def __init__(self, url, page=None, debug=False):
    """Store the normalized URL, its page content, and its BSP info.

    url   -- address of the blog page; kept on ``self.url`` after
             ``URL.normalize``.
    page  -- page content; when falsy, the normalized URL is fetched and the
             response body (if any) is used instead.
    debug -- stored on ``self.debug``.

    ``self.bsp_info`` is the result of ``BSP().normalize`` on the ``url``
    argument as given (not on the normalized ``self.url``).
    """
    self.debug = debug
    self.url = URL.normalize(url)
    self.page = page
    if not page:
        response = Mario().get(self.url)
        if response and response.body:
            self.page = response.body
    self.bsp_info = BSP().normalize(url)
def count(self, query=None, domain=None):
    """Return the result total scraped from the search page's resultStats.

    query  -- search terms; falls back to ``self.query`` when falsy.
    domain -- search domain; falls back to ``self.domain`` when falsy.

    Spaces in the query are replaced with ``%20`` before it is substituted
    into ``GoogleSearch.SEARCH_URL``.  Raises ``GoogleException`` (code 502)
    when the page cannot be fetched.  Returns 0 when neither known
    resultStats layout matches.
    """
    domain = domain or self.domain
    query = query or self.query
    query = re.sub(' ', '%20', query)

    fetcher = Mario()
    fetcher.set_proxies_list(self.proxies)
    response = fetcher.get(GoogleSearch.SEARCH_URL%{'domain':domain, 'query':query})
    if not response:
        raise GoogleException('Fail to open page', 502)

    # Two known layouts of the stats paragraph; the captured group is the total.
    layouts = (
        r'<p id=resultStats> [^^]*?<b>\d+</b> - <b>\d+</b>[^^]*?<b>([^^]*?)</b>',
        r'<p id=resultStats> [^^]*?<b>[^^]*?</b>[^^]*?<b>([^^]*?)</b>[^^]*?<b>\d+</b>-<b>\d+</b>',
    )
    for layout in layouts:
        hits = re.compile(layout).findall(response.body)
        if hits:
            # Strip thousands separators before converting.
            return long(re.sub(',', '', hits[0]))
    return 0
def parser(self, html, sp, homepage):
    """Extract friend/blogroll homepages from ``html`` for provider ``sp``.

    html     -- page content to scan; a falsy value yields None.
    sp       -- provider key; 'baidu', 'sohu' and '163' have dedicated
                handling, anything else is scanned for generic links via
                ``URL.link_title``.
    homepage -- the owner's own homepage; it is left out of the result for
                every provider except '163', which does not compare against it.

    Returns a list of unique homepage URLs (element ``[1]`` of the tuples
    returned by ``BSP.normalize`` / ``BSP.valid163``).  For 'baidu', returns
    None when the user name, the friend-list response, or the friend names
    cannot be obtained.
    """
    if not html:
        return None

    links = []
    if sp == 'baidu':
        username = re.findall('nameEnc: "([^^].*?)"', html)
        if not username:
            return None
        link = 'http://frd.baidu.com/api/friend.getlist?un=%s' % username[0]
        response = Mario().get(link)
        if not response or not response.body:
            return None
        names = re.findall(
            r'\["([^^].*?)","[^^].*?","[^^].*?","[^^].*?",\d+,"[^^].*?",\d+,\d+\]',
            response.body)
        if not names:
            return None
        bsp = BSP()
        for name in names:
            info = bsp.normalize('http://hi.baidu.com/sys/checkuser/%s' % name)
            if info and info[1] != homepage and info[1] not in links:
                # Store the homepage URL, not the whole tuple: the de-dup test
                # above and every other branch work on info[1].
                links.append(info[1])
    elif sp == 'sohu':
        bsp = BSP()
        for url in re.findall('"link" : "([^^].*?)"', html, re.I):
            info = bsp.normalize(url)
            if info and info[1] != homepage and info[1] not in links:
                links.append(info[1])
    elif sp == '163':
        # NOTE(review): this pattern has no capture group, so findall yields
        # the whole matched text; the literal looks redacted -- confirm the
        # intended expression against the upstream source.
        usernames = re.findall('.userName="******"', html)
        bsp = BSP()
        for name in usernames:
            if not name:
                continue
            info = bsp.valid163(name, 'http://%s.blog.163.com/' % name, '163')
            if info and info[1] and info[1] not in links:
                links.append(info[1])
    else:
        bsp = BSP()
        for link, title in URL.link_title(html, homepage):
            if not link:
                continue
            info = bsp.normalize(link)
            if info and info[1] != homepage and info[1] not in links:
                links.append(info[1])
    return links
def _get_page(self, query, page, domain):
    """Fetch one page of search results and wrap them as ``GoogleResult``s.

    query  -- search terms, substituted into the URL template as given.
    page   -- zero-based page index.
    domain -- search domain substituted into the URL template.

    The URL template depends on whether this is the first page and whether
    ``self.number_of_results`` is 10.  Raises ``GoogleException`` (code 502)
    when the page cannot be fetched.  Returns an empty list when
    ``self._parse_response`` yields nothing; otherwise each result carries a
    1-based rank that continues across pages.
    """
    per_page = self.number_of_results
    offset = page * per_page
    default_size = per_page == 10

    params = {'domain': domain, 'query': query}
    if not default_size:
        params['num'] = per_page
    if page == 0:
        template = (GoogleSearch.SEARCH_URL if default_size
                    else GoogleSearch.SEARCH_URL_WITH_NUMBER)
    else:
        params['start'] = offset
        template = (GoogleSearch.NEXT_PAGE if default_size
                    else GoogleSearch.NEXT_PAGE_WITH_NUMBER)

    fetcher = Mario()
    fetcher.set_proxies_list(self.proxies)
    response = fetcher.get(template % params)
    if not response:
        raise GoogleException('Fail to open page', 502)

    parsed = self._parse_response(response.body)
    if not parsed:
        return []
    return [
        GoogleResult(item['unescape_url'], item['title'], item['description'], rank)
        for rank, item in enumerate(parsed, offset + 1)
    ]
def photobucket(self, concount=CONCOUNT):
    """Request the Photobucket RSS feed for ``self.keyword``.

    The response is not returned; it is delivered to
    ``self.photobucket_callback``, which is registered as the fetcher's
    callback.

    concount -- accepted for interface compatibility; not used here.
    """
    rss_url = 'http://feed.photobucket.com/images/%s/feed.rss' % self.keyword
    Mario(callback=self.photobucket_callback).get(rss_url)
def get(url, html):
    """Return the profile image (avatar) URL of the blog at ``url``, or None.

    url  -- blog address; normalized and mapped to ``(username, homepage, sp)``
            through ``BSP.normalize``.
    html -- content of the blog page; only the 'sohu' and '163' providers
            read it, the others fetch what they need themselves.

    Returns None when the URL is not a recognised provider, when a required
    page cannot be fetched, or when the expected markup is not found.
    """
    url = URL.normalize(url)
    bsp_info = BSP().normalize(url)
    if not bsp_info:
        return None
    username, homepage, sp = bsp_info
    mario = Mario()

    def fetch(link):
        """Return the body fetched from ``link``, or None when there is none."""
        response = mario.get(link)
        if not response or not response.body:
            return None
        return response.body

    def first(regex, text):
        """Return the first ``findall`` hit of ``regex`` in ``text``, or None."""
        hits = re.findall(regex, text)
        return hits[0] if hits else None

    # Providers whose avatar sits directly on the homepage; patterns are
    # tried in order and the first hit wins.
    homepage_patterns = {
        'blogbus': ('<img class="avatar" src="([^^].*?)"',),
        'sina': ('<div id="userImage">[^^]*?<img[^^]*?src="([^^].*?)"',
                 '<div class="image">[^^]*?<img[^^]*?src="([^^].*?)"'),
        'baidu': ('<div class="portrait">[^^]*?<img src="([^^].*?)"',),
        'mop': ('<div[^^]*?class="fava_box"[^^]*?<img[^^]*?src="([^^].*?)"',),
    }

    if sp in homepage_patterns:
        body = fetch(homepage)
        if body is None:
            return None
        for regex in homepage_patterns[sp]:
            src = first(regex, body)
            if src is not None:
                return src
        return None

    if sp == 'sohu':
        ebi = first("var _ebi = '([^^].*?)'", html)
        if ebi is None:
            return None
        body = fetch("http://blog.sohu.com/action/ebi_%s-m_view-type_profile/widget/" % ebi)
        if body is None:
            return None
        return first('<div id="profile_photo">[^^]*?<img src="([^^].*?)"', body)

    if sp == '163':
        host_name = first("hostName : '([^^].*?)'", html)
        data_digest = first("dataDigest : '([^^].*?)'", html)
        if not host_name or not data_digest:
            return None
        body = fetch('http://ud3.blog.163.com/%s/%s/modi=1208265646323&mid=0&tid=0&pdm=1/prev.js'
                     % (host_name, data_digest))
        if body is None:
            return None
        src = first('<img class=[^^]*?src=[^^]*?"([^^].*?)"', body)
        if src is not None:
            # The last captured character is dropped; presumably an escape
            # backslash from the JS-quoted markup -- TODO confirm.
            return src[:-1]
        # Fall back to the profile page when prev.js has no image.
        body = fetch('http://blog.163.com/%s/profile/' % host_name)
        if body is None:
            return None
        return first('<img class="bd01 g_img_00 g_c_hand" src="([^^].*?)"', body)

    if sp == 'blogcn':
        body = fetch(homepage)
        if body is None:
            return None
        # NOTE(review): no capture group here, so the whole matched text is
        # used as the user name; the literal looks redacted -- confirm.
        blog_user = first('var[^^]*?blogusername="******"', body)
        if blog_user is None:
            return None
        body = fetch('http://userinfo.blogcn.com/%s.shtml' % blog_user)
        if body is None:
            return None
        return first('<img class="top-5px" src="([^^].*?)"', body)

    if sp == 'ycool':
        body = fetch(homepage)
        if body is None:
            return None
        # The '?' must be escaped: unescaped it makes the preceding 'p'
        # optional and the pattern can never match a literal "space.php?uid=".
        uid = first(r'<a href="http://www.ycool.com/space.php\?uid=([^^].*?)"', body)
        if uid is None:
            return None
        return 'http://ug.ycstatic.com/avatar/%sx96.jpg' % uid

    if sp == 'hexun':
        body = fetch(homepage)
        if body is None:
            return None
        script_src = first('<div id="master_ptoto_1">[^^]*?<script src=\'([^^].*?)\'>', body)
        if script_src is None:
            return None
        body = fetch(script_src)
        if body is None:
            return None
        return first("<img src='([^^].*?)'", body)

    if sp == 'live':
        tile_regex = '<div class="cxp_ic_tile_clip"[^^]*?<img[^^]*?src="([^^].*?)"'
        body = fetch(homepage)
        if body is None:
            return None
        # The homepage must show a tile, but the returned image is taken
        # from the 'recent/' page.
        if first(tile_regex, body) is None:
            return None
        body = fetch(urljoin(homepage, 'recent/'))
        if body is None:
            return None
        return first(tile_regex, body)

    if sp == 'tianya':
        body = fetch(homepage)
        if body is None:
            return None
        writer = first(
            r'<BloggerMemsList>[^^]*?<a href="http://www.tianya.cn/browse/listwriter.asp\?vwriter=([^^].*?)&idWriter=0&Key=0"[^^]*?</a>',
            body)
        if writer is None:
            return None
        body = fetch('http://my.tianya.cn/mytianya/ListWriterNew.asp?vwriter=%s' % writer)
        if body is None:
            return None
        return first('<img onload="[^^]*?src="([^^].*?)"', body)

    return None
def get(self):
    """Collect the friend/blogroll links for the blog held by this object.

    Uses ``self.bsp_info`` (``(username, homepage, sp)``) and ``self.page``.
    Depending on the provider, the page handed to ``self.parser`` is the
    stored page itself, a dedicated links/friends page, the response of the
    163 friends call, or (for hexun) a synthetic list of anchors built from
    the friends pages.

    Returns None when there is no BSP info, no page content, or the provider
    is not in the supported list; otherwise returns whatever ``self.parser``
    returns.
    """
    if not self.bsp_info:
        logger.debug('Not a valid bsp')
        return None
    if not self.page:
        logger.debug("Cant't fetch content.")
        return None

    html = self.page
    username, homepage, sp = self.bsp_info
    links_url = None
    if sp not in ('tianya', 'ycool', 'blogcn', '163', 'cnblogs', 'sina',
                  'live', 'blogbus', 'baidu', 'hexun', 'sohu'):
        return None

    if sp == 'sohu':
        ebi = re.findall("var _ebi = '([^^].*?)'", html)
        if ebi:
            links_url = 'http://blog.sohu.com/sff/links/%s.html' % ebi[0]
    elif sp == '163':
        host_names = re.findall("hostName : '([^^].*?)'", html)
        data_digests = re.findall("dataDigest : '([^^].*?)'", html)
        if host_names and data_digests:
            link = ('http://%s.blog.163.com/friends/dwr/call/plaincall/UserBean.getFriends.dwr'
                    % host_names[0])
            body = [
                ('callCount', '1'),
                ('scriptSessionId', '${scriptSessionId}561'),
                ('c0-scriptName', 'UserBean'),
                ('c0-methodName', 'getFriends'),
                ('c0-id', 0),
                ('c0-param0', 'boolean:false'),
                ('c0-param1', 'number:0'),
                ('c0-param2', 'number:0'),
                ('c0-param3', 'number:20'),
                ('batchId', 0),
            ]
            response = Mario().get(link, body=urlencode(body))
            if response and response.body:
                html = response.body
    elif sp == 'baidu':
        links_url = urljoin(homepage, 'friends')
    elif sp == 'hexun':
        friends_block_re = re.compile('<!-- 朋友列表:开始 -->[^^]*?<!-- 朋友列表:结束 -->')
        friend_id_re = re.compile(
            '<div class="FriendTableList_2_1_1"><a href="/([^^].*?)/default.html"', re.I)
        blogname_re = re.compile('blogname=([^^].*?)&preview=', re.I)

        friend_links = []
        fetched = []

        def callback(response):
            # Collect the body of every friend page that came back non-empty.
            if response and response.body:
                fetched.append(response.body)

        page = 1
        while True:
            response = Mario().get(
                'http://hexun.com/%s/%d/t0/friends.html' % (username, page))
            if not response or not response.body:
                break
            blocks = friends_block_re.findall(response.body)
            if not blocks:
                break
            ids = friend_id_re.findall(blocks[0])
            if not ids:
                break

            # Start each page with an empty collection for the callback.
            del fetched[:]
            batch = MarioBatch(callback=callback)
            for friend_id in ids:
                batch.add_job('http://hexun.com/%s/default.html' % friend_id)
            # presumably 5 is the batch concurrency -- confirm against MarioBatch
            batch(5)

            found_on_page = False
            for friend_page in fetched:
                names = blogname_re.findall(friend_page)
                if not names:
                    continue
                friend_links.append(
                    '<a href="http://%s.blog.hexun.com/">link</a>' % names[0])
                found_on_page = True
            # Stop at the first page that contributes no friend link.
            if not found_on_page:
                break
            page += 1
        html = ','.join(friend_links)
    elif sp == 'blogcn':
        # normalize() yields a (username, homepage, sp) tuple or a falsy
        # value.  Refetch from the normalized homepage whenever normalization
        # succeeds; a failed normalization keeps the stored page instead of
        # raising on the subscript.
        normalized = BSP().normalize(homepage)
        if normalized:
            response = Mario().get(normalized[1])
            if response and response.body:
                html = response.body

    if links_url:
        # A dedicated links page replaces the stored page entirely; an empty
        # fetch leaves html empty so the parser returns None.
        html = ''
        response = Mario().get(links_url)
        if response and response.body:
            html = response.body
    return self.parser(html, sp, homepage)