Beispiel #1
0
 def validSina(self, url, sp):
     """Derive the Sina blog identity from ``url``.

     Article (``/s/``) and ``/lm/`` URLs are rejected.  For ``/m/<name>``
     and plain ``http://blog.sina.com.cn/<name>`` URLs the name is taken
     from the URL and passed through ``self.normalizeName``; for ``/u/``
     URLs the page is downloaded and the name is read from its
     ``uhost : "..."`` entry.

     Returns:
         ``(username, 'http://blog.sina.com.cn/<username>/', sp)`` or
         ``None`` when no user name can be determined.
     """
     # URLs that never identify a blog owner.
     for rejected in ('blog.sina.com.cn/s/', 'http://blog.sina.com.cn/lm/'):
         if rejected in url:
             return None

     name = ''
     if 'blog.sina.com.cn/m/' in url:
         found = re.compile(r'http://blog.sina.com.cn/m/([^^].*)', re.I).findall(url)
         if not found:
             return None
         name = self.normalizeName(found[0])
     elif 'blog.sina.com.cn/u/' in url:
         # Numeric-id style URL: the host name is only available in the page.
         response = Mario().get(url)
         if not response or not response.body:
             return None
         found = re.compile(r'uhost : "([^^].*?)"').findall(response.body)
         if not found:
             return None
         name = found[0]
     elif 'http://blog.sina.com.cn/' in url:
         found = re.compile(r'http://blog.sina.com.cn/([^^].*)', re.I).findall(url)
         if not found:
             return None
         name = self.normalizeName(found[0])
         # 'main_v5' is excluded explicitly as a user name.
         if name in ('main_v5', ):
             name = None

     if not name:
         return None
     return (name, 'http://blog.sina.com.cn/%s/'%name, sp)
Beispiel #2
0
 def check_proxy(self, proxy):
     """Try to fetch http://www.baidu.com through ``proxy``.

     Returns:
         ``None`` when ``proxy`` is empty, otherwise whatever
         ``Mario.get`` returns for the request.
     """
     if proxy:
         probe_url = 'http://www.baidu.com'
         client = Mario()
         logger.debug('proxy: %s'%proxy)
         return client.get(url=probe_url, proxy={'url': proxy})
     return None
Beispiel #3
0
 def get(self, keyword):
     """Query ``self.url_struct`` for ``keyword`` and parse the reply.

     The keyword is URL-quoted and substituted into ``self.url_struct``.

     Returns:
         The result of ``self.parse`` on the response body, or ``[]``
         when the request yields no response.
     """
     quoted = quote(keyword)
     target = self.url_struct%quoted
     client = Mario(None)
     reply = client.get(target)
     if not reply:
         return []
     return self.parse(reply.body)
Beispiel #4
0
 def get(self, keyword):
     """Query ``self.url_struct`` for ``keyword`` using a GBK-encoded query.

     The keyword is decoded as UTF-8, re-encoded as GBK, URL-quoted and
     substituted into ``self.url_struct``.

     Returns:
         The result of ``self.parse`` on the response body, or ``[]``
         when the request yields no response.
     """
     as_text = keyword.decode('utf-8')
     quoted = quote(as_text.encode('gbk'))
     target = self.url_struct%quoted
     client = Mario(None)
     reply = client.get(target)
     if not reply:
         return []
     return self.parse(reply.body)
Beispiel #5
0
 def __init__(self, url, page=None, debug=False):
     """Remember the blog URL, its page source and its provider info.

     Args:
         url: blog URL; stored normalized in ``self.url``.
         page: page source; when empty it is downloaded from ``self.url``
             (and stays as passed in if the download yields nothing).
         debug: stored as ``self.debug``.
     """
     self.url = URL.normalize(url)
     self.page = page
     if not page:
         fetched = Mario().get(self.url)
         body = fetched.body if fetched else None
         if body:
             self.page = body
     self.debug = debug
     # Provider info is computed from the URL as passed in, not self.url.
     self.bsp_info = BSP().normalize(url)
Beispiel #6
0
 def count(self, query=None, domain=None):
     """Return the number of results reported for a search.

     Args:
         query: search terms; defaults to ``self.query``.  Spaces are
             replaced by ``%20`` before building the URL.
         domain: search domain; defaults to ``self.domain``.

     Returns:
         The total parsed from the ``resultStats`` paragraph (thousands
         separators removed), or ``0`` when neither layout matches.

     Raises:
         GoogleException: when the page cannot be fetched.
     """
     domain = domain or self.domain
     query = query or self.query
     query = re.sub(' ', '%20', query)
     url = GoogleSearch.SEARCH_URL%{'domain':domain, 'query':query}

     fetcher = Mario()
     fetcher.set_proxies_list(self.proxies)
     response = fetcher.get(url)
     if not response:
         raise GoogleException('Fail to open page', 502)

     # Two layouts of the statistics line are tried, in this order.
     matchers = (
         re.compile(r'<p id=resultStats>&nbsp;[^^]*?<b>\d+</b> - <b>\d+</b>[^^]*?<b>([^^]*?)</b>'),
         re.compile(r'<p id=resultStats>&nbsp;[^^]*?<b>[^^]*?</b>[^^]*?<b>([^^]*?)</b>[^^]*?<b>\d+</b>-<b>\d+</b>'),
     )
     for matcher in matchers:
         hits = matcher.findall(response.body)
         if hits:
             return long(re.sub(',', '', hits[0]))
     return 0
Beispiel #7
0
 def parser(self, html, sp, homepage):
     """Extract the friend/blogroll links of a blog from ``html``.

     Args:
         html: text to scan (page source or API payload).
         sp: provider key.  ``'baidu'``, ``'sohu'`` and ``'163'`` have
             dedicated handling; anything else is treated as generic HTML
             and scanned with ``URL.link_title``.
         homepage: URL of the blog being inspected.  Links that normalize
             to it are skipped (the ``'163'`` branch does not compare
             against it).

     Returns:
         A list of unique link URLs (possibly empty), or ``None`` when
         ``html`` is empty or, for ``'baidu'``, when the user name or the
         friend list cannot be obtained.
     """
     if not html:
         return None
     links = []
     if sp == 'baidu':
         username = re.compile('nameEnc: "([^^].*?)"').findall(html)
         if not username:
             return None
         link = 'http://frd.baidu.com/api/friend.getlist?un=%s'%username[0]
         response = Mario().get(link)
         if not response or not response.body:
             return None
         # Each friend is an array literal; only its first field is captured.
         pattern = re.compile(r'\["([^^].*?)","[^^].*?","[^^].*?","[^^].*?",\d+,"[^^].*?",\d+,\d+\]')
         names = pattern.findall(response.body)
         if not names:
             return None
         bsp = BSP()
         for n in names:
             u = bsp.normalize('http://hi.baidu.com/sys/checkuser/%s'%n)
             if u and u[1] != homepage and u[1] not in links:
                 # Store the URL (u[1]) like every other branch; storing the
                 # whole normalize() result defeated the duplicate check above.
                 links.append(u[1])
     elif sp == 'sohu':
         urls = re.compile('"link" : "([^^].*?)"', re.I).findall(html)
         bsp = BSP()
         for url in urls:
             r = bsp.normalize(url)
             if r and r[1] != homepage and r[1] not in links:
                 links.append(r[1])
     elif sp == '163':
         # NOTE(review): the pattern found here, '.userName="******"', is not a
         # valid regular expression (re raises "multiple repeat").  The capture
         # group below follows the convention of the other patterns in this
         # file -- confirm against a real 163 friends payload.
         usernames = re.compile('.userName="([^^].*?)"').findall(html)
         if usernames:
             bsp = BSP()
             for u in usernames:
                 if not u:
                     continue
                 # The URL template was missing the '//' after the scheme.
                 link = bsp.valid163(u, 'http://%s.blog.163.com/'%u, '163')
                 if link and link[1] and link[1] not in links:
                     links.append(link[1])
     else:
         bsp = BSP()
         for link, title in URL.link_title(html, homepage):
             if not link:
                 continue
             r = bsp.normalize(link)
             if r and r[1] != homepage and r[1] not in links:
                 links.append(r[1])
     return links
Beispiel #8
0
 def _get_page(self, query, page, domain):
     """Fetch one page of search results and wrap them in ``GoogleResult``.

     The URL template depends on whether this is the first page
     (``page == 0``) and on whether ``self.number_of_results`` is 10.

     Returns:
         A list of ``GoogleResult`` objects whose last constructor
         argument is ``page*number_of_results + index + 1``; ``[]`` when
         the response parses to nothing.

     Raises:
         GoogleException: when the page cannot be fetched.
     """
     per_page = self.number_of_results
     default_size = per_page == 10
     params = {'domain':domain, 'query':query}
     if not default_size:
         params['num'] = per_page
     if page == 0:
         template = GoogleSearch.SEARCH_URL if default_size else GoogleSearch.SEARCH_URL_WITH_NUMBER
     else:
         params['start'] = page*per_page
         template = GoogleSearch.NEXT_PAGE if default_size else GoogleSearch.NEXT_PAGE_WITH_NUMBER
     url = template%params

     fetcher = Mario()
     fetcher.set_proxies_list(self.proxies)
     response = fetcher.get(url)
     if not response:
         raise GoogleException('Fail to open page', 502)

     parsed = self._parse_response(response.body)
     if not parsed:
         return []
     first_position = page*per_page + 1
     hits = []
     for offset, item in enumerate(parsed):
         hits.append(GoogleResult(item['unescape_url'], item['title'], item['description'], first_position + offset))
     return hits
Beispiel #9
0
 def photobucket(self, concount=CONCOUNT):
     """Request the Photobucket RSS feed for ``self.keyword``.

     The feed is fetched through a ``Mario`` created with
     ``self.photobucket_callback`` as its callback; this method itself
     returns nothing.

     Args:
         concount: not used by this method; kept so existing callers
             that pass it keep working.
     """
     rss_url = 'http://feed.photobucket.com/images/%s/feed.rss'%self.keyword
     mario = Mario(callback=self.photobucket_callback)
     mario.get(rss_url)
Beispiel #10
0
 def get(url, html):
     """Return the avatar (profile image) URL of the blog at ``url``.

     Args:
         url: blog URL; it is normalized and resolved to a
             ``(username, homepage, provider)`` triple via ``BSP.normalize``.
         html: source of the blog page.  Only the ``'sohu'`` and ``'163'``
             branches read it; the other providers download what they need.

     Returns:
         The first matching image URL, or ``None`` when the URL cannot be
         resolved, the provider has no branch here, a download fails or
         comes back empty, or nothing matches.
     """
     url = URL.normalize(url)
     bsp_info = BSP().normalize(url)
     if not bsp_info:
         return None
     _, homepage, sp = bsp_info
     mario = Mario()

     def fetch(link):
         # Body of ``link``, or None when the download failed or was empty.
         response = mario.get(link)
         if not response or not response.body:
             return None
         return response.body

     def first(pattern, text):
         # First capture of ``pattern`` in ``text``, or None without a match.
         found = re.findall(pattern, text)
         return found[0] if found else None

     def scrape(link, pattern):
         # Download ``link`` and return the first capture of ``pattern`` in it.
         body = fetch(link)
         if not body:
             return None
         return first(pattern, body)

     # Providers whose avatar is a single match on the homepage.
     homepage_patterns = {
         'blogbus': '<img class="avatar" src="([^^].*?)"',
         'baidu': '<div class="portrait">[^^]*?<img src="([^^].*?)"',
         'mop': '<div[^^]*?class="fava_box"[^^]*?<img[^^]*?src="([^^].*?)"',
     }
     if sp in homepage_patterns:
         return scrape(homepage, homepage_patterns[sp])

     if sp == 'sohu':
         ebi = first("var _ebi = '([^^].*?)'", html)
         if not ebi:
             return None
         return scrape("http://blog.sohu.com/action/ebi_%s-m_view-type_profile/widget/"%ebi,
                       '<div id="profile_photo">[^^]*?<img src="([^^].*?)"')
     elif sp == '163':
         host_name = first("hostName     : '([^^].*?)'", html)
         data_digest = first("dataDigest\t  : '([^^].*?)'", html)
         if not host_name or not data_digest:
             return None
         body = fetch('http://ud3.blog.163.com/%s/%s/modi=1208265646323&mid=0&tid=0&pdm=1/prev.js'%(host_name, data_digest))
         if not body:
             return None
         avatar = first('<img class=[^^]*?src=[^^]*?"([^^].*?)"', body)
         if avatar:
             # The last captured character is dropped, as before --
             # presumably an escape character in the JS payload; TODO confirm.
             return avatar[:-1]
         # Nothing in prev.js: fall back to the public profile page.
         return scrape('http://blog.163.com/%s/profile/'%host_name,
                       '<img class="bd01 g_img_00 g_c_hand" src="([^^].*?)"')
     elif sp == 'blogcn':
         # NOTE(review): the pattern found here ended in 'blogusername="******"',
         # which is not a valid regular expression (re raises "multiple
         # repeat").  The capture group follows the convention of the other
         # patterns in this function -- confirm against a real blogcn page.
         blog_user = scrape(homepage, 'var[^^]*?blogusername="([^^].*?)"')
         if not blog_user:
             return None
         return scrape('http://userinfo.blogcn.com/%s.shtml'%blog_user,
                       '<img class="top-5px" src="([^^].*?)"')
     elif sp == 'ycool':
         # '?' must be escaped: unescaped it made the preceding 'p' optional
         # and the literal "space.php?uid=" could never match.
         uid = scrape(homepage, r'<a href="http://www.ycool.com/space\.php\?uid=([^^].*?)"')
         if not uid:
             return None
         return 'http://ug.ycstatic.com/avatar/%sx96.jpg'%uid
     elif sp == 'hexun':
         # The avatar markup is produced by a script referenced on the homepage.
         script_url = scrape(homepage, '<div id="master_ptoto_1">[^^]*?<script src=\'([^^].*?)\'>')
         if not script_url:
             return None
         return scrape(script_url, "<img src='([^^].*?)'")
     elif sp == 'live':
         tile = '<div class="cxp_ic_tile_clip"[^^]*?<img[^^]*?src="([^^].*?)"'
         # The homepage must contain a tile, but the URL returned is the one
         # found on the 'recent/' page.
         if not scrape(homepage, tile):
             return None
         return scrape(urljoin(homepage, 'recent/'), tile)
     elif sp == 'sina':
         body = fetch(homepage)
         if not body:
             return None
         # Two page layouts are tried in order.
         return (first('<div id="userImage">[^^]*?<img[^^]*?src="([^^].*?)"', body)
                 or first('<div class="image">[^^]*?<img[^^]*?src="([^^].*?)"', body))
     elif sp == 'tianya':
         writer = scrape(homepage, r'<BloggerMemsList>[^^]*?<a href="http://www.tianya.cn/browse/listwriter.asp\?vwriter=([^^].*?)&idWriter=0&Key=0"[^^]*?</a>')
         if not writer:
             return None
         return scrape('http://my.tianya.cn/mytianya/ListWriterNew.asp?vwriter=%s'%writer,
                       '<img onload="[^^]*?src="([^^].*?)"')
     return None
Beispiel #11
0
 def get(self):
     """Collect the friend/blogroll links of the blog in ``self.bsp_info``.

     Depending on the provider, the text handed to ``self.parser`` is the
     already downloaded page (``self.page``), a provider-specific friends
     page or API reply fetched here, or -- for hexun -- a synthetic list
     of anchors built by walking the paginated friends list.

     Returns:
         Whatever ``self.parser`` returns, or ``None`` when
         ``self.bsp_info`` or ``self.page`` is empty or the provider is
         not one of the supported ones.
     """
     if not self.bsp_info:
         logger.debug('Not a valid bsp')
         return None
     if not self.page:
         logger.debug("Cant't fetch content.")
         return None
     html = self.page
     username, homepage, sp = self.bsp_info
     links_url = None
     if sp not in ('tianya', 'ycool', 'blogcn', '163', 'cnblogs', 'sina', 'live', 'blogbus', 'baidu', 'hexun', 'sohu'):
         return None
     if sp == 'sohu':
         res = re.compile("var _ebi = '([^^].*?)'").findall(html)
         if res:
             links_url = 'http://blog.sohu.com/sff/links/%s.html'%res[0]
     elif sp == '163':
         hostNameRes = re.compile("hostName     : '([^^].*?)'").findall(html)
         dataDigest = re.compile("dataDigest\t  : '([^^].*?)'").findall(html)
         # Both values must be present on the page before the friends API
         # is called; only the host name is used in the request.
         if hostNameRes and dataDigest:
             link = 'http://%s.blog.163.com/friends/dwr/call/plaincall/UserBean.getFriends.dwr'%hostNameRes[0]
             body = [('callCount', '1'), ('scriptSessionId', '${scriptSessionId}561'), ('c0-scriptName', 'UserBean'), ('c0-methodName','getFriends'), ('c0-id', 0), ('c0-param0', 'boolean:false'), ('c0-param1', 'number:0'), ('c0-param2', 'number:0'), ('c0-param3', 'number:20'), ('batchId', 0),]
             response = Mario().get(link, body=urlencode(body))
             if response and response.body:
                 html = response.body
     elif sp == 'baidu':
         links_url = urljoin(homepage, 'friends')
     elif sp == 'hexun':
         friend_links = []
         page = 1
         results = []
         def callback(response):
             # Collect the bodies of the friend homepages fetched in batch.
             if response and response.body:
                 results.append(response.body)
         list_pattern = re.compile('<!--  朋友列表:开始  -->[^^]*?<!--  朋友列表:结束  -->')
         id_pattern = re.compile('<div class="FriendTableList_2_1_1"><a href="/([^^].*?)/default.html"', re.I)
         name_pattern = re.compile('blogname=([^^].*?)&preview=', re.I)
         while True:
             response = Mario().get('http://hexun.com/%s/%d/t0/friends.html'%(username, page))
             if not response or not response.body:
                 break
             dom = list_pattern.findall(response.body)
             if not dom:
                 break
             ids = id_pattern.findall(dom[0])
             if not ids:
                 break
             # Start every page with an empty batch result.
             del results[:]
             batch = MarioBatch(callback=callback)
             for friend_id in ids:
                 batch.add_job('http://hexun.com/%s/default.html'%friend_id)
             batch(5)
             if not results:
                 break
             has_friend_link = False
             for f in results:
                 res = name_pattern.findall(f)
                 if not res:
                     continue
                 friend_links.append('<a href="http://%s.blog.hexun.com/">link</a>'%res[0])
                 has_friend_link = True
             # Stop paging once a page contributes no new link.
             if not has_friend_link:
                 break
             page += 1
         # The generic branch of parser() extracts the links from these anchors.
         html = ','.join(friend_links)
     elif sp == 'blogcn':
         nu = BSP().normalize(homepage)
         # Compare the normalized homepage URL (nu[1]), not the whole
         # normalize() result, and tolerate normalize() returning nothing:
         # the former comparison was always true and nu[1] crashed on None.
         if nu and nu[1] != homepage:
             response = Mario().get(nu[1])
             if response and response.body:
                 html = response.body
     if links_url:
         # A dedicated links page replaces the original page entirely.
         html = ''
         response = Mario().get(links_url)
         if response and response.body:
             html = response.body
     return self.parser(html, sp, homepage)