def fetch_categorys(self):
    """Collect the sub-category links found on the eslite category pages.

    Each hard-coded category page is fetched through ``self.snoopy``; carriage
    returns and line feeds are removed from the HTML so an anchor split across
    lines can still match, and every ``newbook_list.aspx?...`` anchor is
    turned into a category record.

    Returns:
        A list of category records. Each record starts from
        ``self.init_category_format()`` and gets its ``cate``, ``sub``,
        ``list`` (taken from the decoded link) and ``text`` (the stripped,
        UTF-8 decoded anchor text) entries filled in. Links that cannot be
        parsed are skipped.
    """
    category_urls = [
        "http://www.eslite.com/category.aspx?cate=80",   # Chinese-language books
        "http://www.eslite.com/category.aspx?cate=156",  # foreign-language books
        "http://www.eslite.com/category.aspx?cate=44",   # children's books
    ]

    # The patterns do not depend on the page, so compile them once.
    carriage_return = re.compile("\r")
    line_feed = re.compile("\n")
    # '.' and '?' are escaped so they match the literal characters of
    # "newbook_list.aspx?" instead of acting as regex metacharacters.
    link_pattern = re.compile(
        r'<a href="(newbook_list\.aspx\?.*?)">(.*?)</a>')

    categorys = []
    for category_url in category_urls:
        self.snoopy.fetch(category_url)
        html = self.snoopy.results
        # NOTE(review): str_repalce is defined elsewhere; it is called here as
        # (text, compiled pattern, replacement) exactly as before.
        html = str_repalce(html, carriage_return, "")
        html = str_repalce(html, line_feed, "")

        for href, text in link_pattern.findall(html):
            try:
                category = self.init_category_format()
                params = url_decode(href)
                category["cate"] = params["cate"]
                category["sub"] = params["sub"]
                category["list"] = params["list"]
                category["text"] = text.strip().decode("utf8")
            except Exception:
                # Best effort: a link that lacks one of the expected
                # parameters (or cannot be decoded) is skipped. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                continue
            categorys.append(category)
    return categorys
def parseMiscInfo(el):
    """
    Extract a weibo post's comment count, repost count, publish time and
    source, and obtain the mid from the post's URL.

    ``el`` must provide an ``xpath()`` method (presumably an lxml element --
    confirm against the caller). The first child with class
    ``info W_linkb W_textb`` is used as the info node; an IndexError is
    raised if it, the date link, or the trailing link text is missing.

    Returns a ``(info, mid)`` tuple where ``info`` is a dict with the keys
    ``ccount``, ``rcount``, ``cdate``, ``uid`` and ``source``, and ``mid`` is
    the last segment of the date link's href passed through ``url_decode``.
    """
    info_node = el.xpath("./*[@class='info W_linkb W_textb']")[0]
    info = {
        'ccount': 0,
        'rcount': 0,
        'source': '',
    }

    # Repost and comment counts live in the text of the action links.
    for link in info_node.xpath("./span/a"):
        action = link.get('action-type', '')
        if action == 'feed_list_forward':
            info['rcount'] = NodeService.getCount(link.text)
        elif action == 'feed_list_comment':
            info['ccount'] = NodeService.getCount(link.text)

    date_value = info_node.xpath(
        "./a[@node-type='feed_list_item_date']/@date")[0]
    info['cdate'] = int(date_value)

    # The href's last path segment yields the mid and the one before it
    # is the numeric uid.
    href_value = info_node.xpath(
        "./a[@node-type='feed_list_item_date']/@href")[0]
    href_parts = unicode(href_value).split('/')
    mid = url_decode(href_parts[-1])
    info['uid'] = int(href_parts[-2])

    source_text = info_node.xpath("./a[last()]/text()")[0]
    info['source'] = unicode(source_text)
    return info, mid
def _get_filename_by_url(self, url):
    """Derive a file name from the last path segment of ``url``.

    The URL must look like ``scheme://host/path``; the path is taken up to
    the first ``?`` or ``#`` and its final ``/``-separated segment is passed
    through ``url_decode`` (a helper defined elsewhere).

    Returns the decoded segment, or ``"download"`` when the segment is empty
    or anything at all goes wrong (non-matching URL, wrong argument type,
    decoding failure).
    """
    fallback = "download"
    try:
        import re
        matched = re.match(r"[^:]+://[^/]+/?([^?#]*)", url)
        # A failed match leaves ``matched`` as None; the resulting
        # AttributeError is handled by the fallback below.
        path = matched.group(1)
        basename = path.rsplit('/', 1)[-1]
        if not basename:
            return fallback
        return url_decode(basename)
    except Exception:
        return fallback
def _get_filename_by_url(self, url):
    """Pick a download file name out of ``url``.

    Matches ``scheme://host`` followed by an optional slash, captures the
    path up to the first ``?`` or ``#``, and keeps only the text after the
    path's last ``/``. That text is run through ``url_decode`` (defined
    elsewhere) and returned.

    Falls back to ``"download"`` when the path ends in ``/`` or is empty,
    and also whenever any exception is raised along the way.
    """
    try:
        import re
        path = re.match(r"[^:]+://[^/]+/?([^?#]*)", url).group(1)
        # rpartition keeps the whole path when it contains no '/'.
        _, _, last_segment = path.rpartition('/')
        return url_decode(last_segment) if last_segment else "download"
    except Exception:
        return "download"
def _filename_from_content_disposition(self, content_disposition):
    """Return the ``filename`` parameter of a Content-Disposition value.

    Args:
        content_disposition: The header value, e.g.
            ``attachment; filename="report.pdf"``.

    Returns:
        The file name after surrounding whitespace and double quotes are
        removed and the text is passed through ``url_decode`` (a helper
        defined elsewhere), or ``None`` when there is no ``filename``
        parameter or its value is empty.
    """
    # rfc2183
    # The first ';'-separated segment is the disposition type, not a
    # parameter, so it is skipped.
    for segment in content_disposition.split(';')[1:]:
        # Split on the first '=' only: splitting on every '=' truncated file
        # names that themselves contain '='.
        name, separator, value = segment.partition('=')
        # Parameter names are case-insensitive.
        if not separator or name.strip().lower() != 'filename':
            continue
        # Drop whitespace around the value first so the quotes are actually
        # at the ends when they are stripped.
        filename = url_decode(value.strip().strip('"'))
        if filename:
            return filename
        # An empty filename ends the search, as before.
        break
    return None
def _filename_from_content_disposition(self, content_disposition):
    """Extract the file name carried by a Content-Disposition header value.

    Args:
        content_disposition: The raw header value, for example
            ``attachment; filename="archive.zip"``.

    Returns:
        The ``filename`` parameter's value with surrounding whitespace and
        double quotes removed and then run through ``url_decode`` (defined
        elsewhere), or ``None`` if the parameter is absent or empty.
    """
    # rfc2183
    parameters = content_disposition.split(';')[1:]  # [0] is the type
    for parameter in parameters:
        # maxsplit=1 keeps any '=' inside the value; an unbounded split used
        # to cut such file names short.
        pieces = parameter.split('=', 1)
        if len(pieces) < 2:
            continue
        # Parameter names are matched case-insensitively.
        if pieces[0].strip().lower() != 'filename':
            continue
        # Trim outer whitespace before the quotes so `filename = "x"` works.
        filename = url_decode(pieces[1].strip().strip('"'))
        if filename:
            return filename
        # Stop at an empty filename rather than scanning further.
        break
    return None