def photobucket_callback(self, response):
    # Parse the Photobucket RSS feed and queue (normalized URL, title) pairs.
    try:
        rss = feedparser.parse(response.body)
    except Exception:
        return
    for entry in rss['entries']:
        self.urls.append((URL.normalize(entry.guid), entry.title))
def matched_feed(self, response):
    # Look up the link/title database for an entry whose normalized URL
    # matches either the requested or the effective URL of this response.
    links = self.mario.link_title_db.dic
    for k, v in links.iteritems():
        link = URL.normalize(k)
        if link in (response.url, response.effective_url):
            return v[0][2]
    return None
def next_depth(self, response):
    #with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
    # Queue every link found in the page, record its title, then mark both
    # the requested and the effective URL as crawled.
    for link, title in URL.link_title(response.body, response.effective_url):
        if not self.inject_url(link, response.args):
            continue
        self.link_title_db.add(link, response.effective_url, title)
    if callable(self.callback):
        self.callback(response)
    self.crawled[response.effective_url] = 2
    if response.effective_url != response.url:
        self.crawled[response.url] = 2
    self.referer = response.effective_url
def __init__(self, url, page=None, debug=False):
    self.url = URL.normalize(url)
    self.page = page
    if not page:
        # No page supplied: fetch it ourselves.
        mario = Mario()
        response = mario.get(self.url)
        if response and response.body:
            self.page = response.body
    self.debug = debug
    bsp = BSP()
    self.bsp_info = bsp.normalize(url)
def parser(self, html, sp, homepage):
    # Extract friend/neighbour blog links from a profile page, with one
    # branch per supported blog service provider (BSP). See the usage
    # sketch below.
    if not html:
        return None
    links = []
    if sp == 'baidu':
        pattern = re.compile('nameEnc: "([^^].*?)"')
        username = pattern.findall(html)
        if not username:
            return None
        link = 'http://frd.baidu.com/api/friend.getlist?un=%s' % username[0]
        mario = Mario()
        response = mario.get(link)
        if not response or not response.body:
            return None
        pattern = re.compile('\["([^^].*?)","[^^].*?","[^^].*?","[^^].*?",\d+,"[^^].*?",\d+,\d+\]')
        names = pattern.findall(response.body)
        if not names:
            return None
        bsp = BSP()
        for n in names:
            u = bsp.normalize('http://hi.baidu.com/sys/checkuser/%s' % n)
            if u and u[1] != homepage and u[1] not in links:
                links.append(u[1])
    elif sp == 'sohu':
        pattern = re.compile('"link" : "([^^].*?)"', re.I)
        urls = pattern.findall(html)
        bsp = BSP()
        for url in urls:
            r = bsp.normalize(url)
            if r and r[1] != homepage and r[1] not in links:
                links.append(r[1])
    elif sp == '163':
        pattern = re.compile('.userName="******"')
        usernames = pattern.findall(html)
        links = []
        bsp = BSP()
        for u in usernames:
            if not u:
                continue
            link = bsp.valid163(u, 'http://%s.blog.163.com/' % u, '163')
            if link and link[1] and link[1] not in links:
                links.append(link[1])
    else:
        bsp = BSP()
        for link, title in URL.link_title(html, homepage):
            if not link:
                continue
            r = bsp.normalize(link)
            if r and r[1] != homepage and r[1] not in links:
                links.append(r[1])
    return links
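# A minimal usage sketch (illustrative values; parser() expects the raw
# profile HTML, the BSP key, and the caller's homepage to exclude):
#
#   response = Mario().get(homepage)
#   if response and response.body:
#       friend_links = self.parser(response.body, sp, homepage)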
def flickr(self, flickr_api_key, depth=5):
    # Page through Flickr search results for self.keyword and queue the
    # direct image URL built from each photo's farm/server/id/secret.
    api_key = flickr_api_key
    total_pages = depth
    url_form = 'http://%(farm_id)s.static.flickr.com/%(server_id)s/%(id)s_%(secret)s_b.jpg'
    flickr = flickrapi.FlickrAPI(api_key)
    cur_page = 1
    while cur_page <= depth and cur_page <= total_pages:
        try:
            rsp = flickr.photos_search(text=self.keyword, media='photos', per_page='10', page=cur_page)
        except Exception:
            total_pages = 0
            logger.error(Traceback())
            continue
        # The attribute value is a string; coerce it so the loop bound is numeric.
        total_pages = int(rsp[0].attrib['pages'])
        photos = rsp.find('photos')
        for photo in photos:
            self.urls.append((URL.normalize(url_form % {
                'farm_id': photo.attrib['farm'],
                'server_id': photo.attrib['server'],
                'id': photo.attrib['id'],
                'secret': photo.attrib['secret'],
            }), photo.attrib['title']))
        cur_page += 1
def __init__(self, starturl, identifier=None, accept_url_patterns=[], reject_url_patterns=[], analysis=False, verbose=False):
    starturl = URL.normalize(starturl)
    self.analysis = analysis
    self.mixed = 0
    if not identifier:
        identifier = md5(starturl).hexdigest()
    super(Warehouse, self).__init__(starturl, identifier=identifier,
                                    accept_url_patterns=accept_url_patterns,
                                    reject_url_patterns=reject_url_patterns,
                                    analysis=analysis, verbose=verbose)
    bsp = BSP()
    bsp_pac = bsp.get_pac(starturl)
    pac = None
    if bsp_pac:
        pac = bsp_pac
    # Create the Site record on first sight of this start URL.
    if not Site().one({"url_hash": identifier}):
        site = New(Site())
        site.url = starturl if isinstance(starturl, unicode) else starturl.decode('utf-8')
        site.url_hash = identifier if isinstance(identifier, unicode) else identifier.decode('utf-8')
        site.inserted_at = datetime.utcnow()
        site.last_updated_at = datetime.utcnow()
        site.save()
def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
    self.concount = concount
    self.callback = callback
    self.callpre = callpre
    self.callfail = callfail
    self.depth = depth
    self.starturl = starturl
    self.baseurl = URL.baseurl(starturl)
    self.urls = []
    self.crawled = {}
    self.link_title_db = LinkTitleDB()
    self.accept_url_patterns = accept_url_patterns
    self.reject_url_patterns = reject_url_patterns
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    self.referer = starturl
    try:
        self.robotstxt.read()
    except Exception:
        logger.debug(Traceback())
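# A minimal usage sketch (hypothetical call site; the class name and the
# crawl entry point below are assumptions, not confirmed by this module):
#
#   def on_page(response):
#       logger.debug('%s -> %d bytes' % (response.effective_url, len(response.body)))
#
#   spider = Spider('http://example.com/', on_page, depth=2)
#   spider.crawl()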
def normalize(self, url):
    # Match the URL against every supported BSP pattern and dispatch to the
    # provider-specific validator.
    url = URL.normalize(url)
    tmp = url.split('?')
    for b in self.support_bsps:
        pattern = re.compile(b[0], re.I)
        res = pattern.findall(url)
        if not res:
            continue
        name = self.normalizeName(res[0])
        # Skip redirect-style query strings and reserved names.
        if (len(tmp) > 1 and b[2] in tmp[1] and 'http' in tmp[1]) or name in b[1]:
            continue
        if b[2] == 'tianya':
            return self.validTianya(name, b[2])
        if b[2] == 'ycool':
            return self.validYcool(name, b[2])
        if b[2] == 'blogcn':
            return self.validBlogcn(name, b[2])
        if b[2] == '163':
            return self.valid163(name, url, b[2])
        if b[2] == 'cnblogs':
            return self.validCnblogs(name, b[2])
        if b[2] == 'sina':
            return self.validSina(url, b[2])
        if b[2] == 'live':
            return self.validLive(name, url, b[2])
        if b[2] == 'blogbus':
            return self.validBlogbus(name, b[2])
        if b[2] == 'baidu':
            return self.validBaidu(name, url, b[2])
        if b[2] == 'hexun':
            return self.validHexun(name, b[2])
        if b[2] == 'sohu':
            return self.validSohu(name, b[2])
        if b[2] == 'mop':
            return self.validMop(name, b[2])
    return None
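# A minimal sketch of how normalize() is consumed elsewhere in this module
# (the example URL is illustrative):
#
#   bsp = BSP()
#   bsp_info = bsp.normalize('http://hi.baidu.com/someuser')
#   if bsp_info:
#       username, homepage, sp = bsp_info  # sp would be 'baidu' here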
def _handle_response_header(self, c):
    """Handle the response header.

    Checks the completed connection for any error condition and, on
    success, returns the normalized effective URL of the response.

    @type c: PycURL C{Curl}
    @param c: a completed connection
    @return: the normalized effective URL of the response
    @raise HTTPException: if an error exists in the response
    """
    code = c.getinfo(c.HTTP_CODE)
    if (c.errstr() == '' and c.getinfo(pycurl.RESPONSE_CODE) in STATUS_OK) or code == 200:
        effective_url = c.getinfo(pycurl.EFFECTIVE_URL)
        size = int(c.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD))
    else:
        if callable(self.callfail):
            self.callfail(c.url)
        raise HTTPException(c.errstr(), code)
    #if self.check_duplicate and URL.been_inserted(effective_url, self.lightcloud): return None
    return URL.normalize(effective_url)
def __init__(self, starturl, identifier=None, verbose=False):
    starturl = URL.normalize(starturl)
    self.mixed = 1
    if not identifier:
        identifier = md5(starturl).hexdigest()
    super(WarehouseRss, self).__init__(starturl, identifier=identifier, verbose=verbose)
def get(url, html):
    # Fetch the avatar image URL for a blog homepage, dispatching on the
    # blog service provider (BSP) detected from the URL.
    url = URL.normalize(url)
    bsp = BSP()
    bsp_info = bsp.normalize(url)
    if not bsp_info:
        return None
    username, homepage, sp = bsp_info
    mario = Mario()
    if sp == 'sohu':
        pattern = re.compile("var _ebi = '([^^].*?)'")
        res = pattern.findall(html)
        if not res:
            return None
        response = mario.get("http://blog.sohu.com/action/ebi_%s-m_view-type_profile/widget/" % res[0])
        if not response or not response.body:
            return None
        pattern = re.compile('<div id="profile_photo">[^^]*?<img src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == '163':
        pattern = re.compile("hostName : '([^^].*?)'")
        hostName = pattern.findall(html)
        if hostName:
            hostName = hostName[0]
        pattern = re.compile("dataDigest : '([^^].*?)'")
        dataDigest = pattern.findall(html)
        if dataDigest:
            dataDigest = dataDigest[0]
        if not hostName or not dataDigest:
            return None
        response = mario.get('http://ud3.blog.163.com/%s/%s/modi=1208265646323&mid=0&tid=0&pdm=1/prev.js' % (hostName, dataDigest))
        if not response or not response.body:
            return None
        pattern = re.compile('<img class=[^^]*?src=[^^]*?"([^^].*?)"')
        res = pattern.findall(response.body)
        if res:
            return res[0][:-1]
        # Fall back to the profile page if the preview script had no image.
        response = mario.get('http://blog.163.com/%s/profile/' % hostName)
        if not response or not response.body:
            return None
        pattern = re.compile('<img class="bd01 g_img_00 g_c_hand" src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'blogcn':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('var[^^]*?blogusername="******"')
        res = pattern.findall(response.body)
        if not res:
            return None
        response = mario.get('http://userinfo.blogcn.com/%s.shtml' % res[0])
        if not response or not response.body:
            return None
        pattern = re.compile('<img class="top-5px" src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'ycool':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<a href="http://www.ycool.com/space.php\?uid=([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return 'http://ug.ycstatic.com/avatar/%sx96.jpg' % res[0]
    elif sp == 'hexun':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<div id="master_ptoto_1">[^^]*?<script src=\'([^^].*?)\'>')
        res = pattern.findall(response.body)
        if not res:
            return None
        response = mario.get(res[0])
        if not response or not response.body:
            return None
        pattern = re.compile("<img src='([^^].*?)'")
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'live':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<div class="cxp_ic_tile_clip"[^^]*?<img[^^]*?src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        # The avatar itself is scraped from the recent-activity page.
        response = mario.get(urljoin(homepage, 'recent/'))
        if not response or not response.body:
            return None
        pattern = re.compile('<div class="cxp_ic_tile_clip"[^^]*?<img[^^]*?src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'blogbus':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<img class="avatar" src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'sina':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<div id="userImage">[^^]*?<img[^^]*?src="([^^].*?)"')
        res = pattern.findall(response.body)
        if res:
            return res[0]
        pattern = re.compile('<div class="image">[^^]*?<img[^^]*?src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'tianya':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<BloggerMemsList>[^^]*?<a href="http://www.tianya.cn/browse/listwriter.asp\?vwriter=([^^].*?)&idWriter=0&Key=0"[^^]*?</a>')
        res = pattern.findall(response.body)
        if not res:
            return None
        response = mario.get('http://my.tianya.cn/mytianya/ListWriterNew.asp?vwriter=%s' % res[0])
        if not response or not response.body:
            return None
        pattern = re.compile('<img onload="[^^]*?src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'baidu':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<div class="portrait">[^^]*?<img src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
    elif sp == 'mop':
        response = mario.get(homepage)
        if not response or not response.body:
            return None
        pattern = re.compile('<div[^^]*?class="fava_box"[^^]*?<img[^^]*?src="([^^].*?)"')
        res = pattern.findall(response.body)
        if not res:
            return None
        return res[0]
def reject_url(self, url):
    # Only follow off-site URLs that match an accept pattern and do not
    # match a reject pattern; same-site URLs are always kept.
    return self.baseurl != URL.baseurl(url) and (
        not self.accept_url_patterns
        or not re.match('|'.join(self.accept_url_patterns), url)
        or (self.reject_url_patterns and re.match('|'.join(self.reject_url_patterns), url))
    )
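# Illustrative behaviour with hypothetical patterns, assuming self.baseurl
# is 'http://example.com':
#
#   self.accept_url_patterns = ['http://blog\.example\.org/']
#   self.reject_url_patterns = ['.*\.pdf$']
#   self.reject_url('http://example.com/page')        # False: same site
#   self.reject_url('http://blog.example.org/post')   # False: accepted off-site
#   self.reject_url('http://other.example.net/page')  # True: off-site, no accept match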
def get_rss_url(self, starturl, etag=None, last_modified=None, proxy=None):
    mario = Mario(referer=starturl, etag=etag, last_modified=last_modified, proxy=proxy)
    response = mario.get(starturl)
    if not response:
        return None
    return URL.rss_link(starturl, response.body)
def connect(self, url, body=None, headers=HEADERS, normalize=True, args=None):
    url = URL.normalize(url, normalize)
    #if self.check_duplicate and URL.been_inserted(url, self.lightcloud): return None
    if callable(self.callpre):
        self.callpre(url)
    c = pycurl.Curl()
    if headers:
        if self.user_agent:
            headers.setdefault('User-Agent', self.user_agent)
        else:
            headers.setdefault('User-Agent', self.random_user_agent())
        header_list = []
        for header_name, header_value in headers.iteritems():
            header_list.append('%s: %s' % (header_name, header_value))
        if self.last_modified:
            header_list.append('%s: %s' % ('If-Modified-Since', self.last_modified))
        if self.etag:
            header_list.append('%s: %s' % ('ETag', self.etag))
        if header_list:
            c.setopt(pycurl.HTTPHEADER, header_list)
    #c.setopt(c.USERAGENT, self.user_agent)
    # Presence of a body indicates that we should do a POST
    if self.post_body:
        body = self.post_body
    if self.login:
        body = self.login
    if body is not None:
        logger.debug('post')
        body = urlencode(body)
        c.setopt(pycurl.POST, 1)
        c.setopt(pycurl.POSTFIELDS, body)
    else:
        c.setopt(pycurl.HTTPGET, 1)
    c.url = url
    c.args = args
    c.setopt(pycurl.ENCODING, 'gzip, deflate')
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 10)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, self.timeout)
    c.setopt(pycurl.NOSIGNAL, 1)
    # Buffers that collect the body and the raw response headers.
    c.response = StringIO()
    c.header_data = StringIO()
    c.setopt(pycurl.WRITEFUNCTION, c.response.write)
    c.setopt(pycurl.HEADERFUNCTION, c.header_data.write)
    try:
        c.setopt(pycurl.URL, URL.quote(url))
    except Exception:
        return None
    if self.cookies:
        cookies = self.cookies
    else:
        cookies = self.parse_cookies(c)
    if cookies:
        c.setopt(pycurl.COOKIELIST, '')
        chunks = []
        for key, value in cookies.iteritems():
            key = quote_plus(key)
            value = quote_plus(value)
            chunks.append('%s=%s;' % (key, value))
        c.setopt(pycurl.COOKIE, ''.join(chunks))
    else:
        cookie_file_name = os.tempnam()
        c.setopt(pycurl.COOKIEFILE, cookie_file_name)
        c.setopt(pycurl.COOKIEJAR, cookie_file_name)
    if self.referer:
        c.setopt(pycurl.REFERER, self.referer)
    if self.verbose:
        c.setopt(pycurl.VERBOSE, True)
        c.setopt(pycurl.DEBUGFUNCTION, self.verbose)
    if self.progress:
        c.setopt(pycurl.NOPROGRESS, False)
        c.setopt(pycurl.PROGRESSFUNCTION, self.progress)
    if self.proxies:
        self.proxy = random.choice(self.proxies)
    if self.proxy:
        if isinstance(self.proxy, (str, unicode)):
            proxy = self.proxy
        else:
            proxy = self.proxy['url']
            if 'userpwd' in self.proxy:
                c.setopt(pycurl.PROXYUSERPWD, self.proxy['userpwd'])
            if 'type' in self.proxy:
                ptype = getattr(pycurl, 'PROXYTYPE_%s' % self.proxy['type'].upper())
                c.setopt(pycurl.PROXYTYPE, ptype)
        c.setopt(pycurl.PROXY, proxy)
    if not self.secure:
        c.setopt(pycurl.SSL_VERIFYPEER, False)
        c.setopt(pycurl.SSL_VERIFYHOST, False)
    logger.debug('connected to %r' % url)
    return c
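# A minimal sketch of how a handle built by connect() is presumably driven
# (single-handle case; the surrounding class may also use CurlMulti):
#
#   c = self.connect('http://example.com/')
#   if c is not None:
#       c.perform()
#       effective_url = self._handle_response_header(c)
#       html = c.response.getvalue()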
            # (continuation of the response handler: fall back to the charset
            # declared in the page's <meta> tag before re-encoding to UTF-8)
            body = body.decode(encoding).encode('utf-8')
        elif charset and charset['encoding'] and charset['encoding'].lower() != 'iso-8859-2':
            pattern = re.compile('<meta http-equiv="Content-Type" content="text/html; charset=([^^].*?)"', re.I|re.S)
            encoding = pattern.findall(body)
            if encoding:
                encoding = encoding[0].lower()
                if encoding in ALT_CODECS:
                    encoding = ALT_CODECS[encoding]
                if encoding.lower() != 'iso-8859-2' and encoding.lower() != 'utf-8':
                    body = body.decode(encoding).encode('utf-8')
    except UnicodeDecodeError, err:
        body = body.decode(encoding, "replace").encode('utf-8')
        #if callable(self.callfail): self.callfail(effective_url)
        #logger.error('Encoding error: %r'%c.url)
        #logger.error(err)
        #return None
    response = HTTPResponse(url=c.url, effective_url=URL.normalize(effective_url), size=size,
                            code=code, body=body, etag=Etag, last_modified=Last_Modified, args=c.args)
    logger.debug(response)
    try:
        if callable(self.callback):
            self.callback(response)
        return response
    except Exception:
        if callable(self.callfail):
            self.callfail(effective_url)
        logger.error('Error: %r' % Traceback())
        return None