def parse(self, url, path):
    f = open(path)
    data = f.read()
    f.close()
    ext = funcs.get_url_ext(url)
    if ext in ('php', 'html', 'htm', 'asp', 'aspx', 'jsp'):
        data, coding = funcs.decode_data(data)
        soup = BeautifulSoup(str(data), 'html5lib', from_encoding='utf-8')
        urls, css_urls, js_urls, img_urls = self.get_link(soup)
        all_hrefs = css_urls + js_urls + urls + img_urls
        self.item.url = url
        self.item.content = str(soup)  # use the modified markup
        self.item.coding = coding      # content encoding
        self.item.all_hrefs = all_hrefs
        self.item = self.update_css_js(self.item)
        content = str(self.item.content).encode(self.item.coding, 'ignore')
    else:
        content = data
    patt = r'[\s\S]+?(/media/html/[\s\S]+)'
    m = re.search(patt, path)
    if m:
        save_path = S_save_root_dir + m.group(1)
    else:
        print path, 'is a wrong path'
        save_path = ''
    if save_path:
        print save_path
        if self.check_dir_path(os.path.dirname(save_path)):
            f = open(save_path, 'w')
            f.write(content)
            f.close()
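# The `funcs` helpers used above are not defined in this file. Below is a
# minimal sketch of what get_url_ext and decode_data might look like,
# assuming get_url_ext returns the extension of the URL path and decode_data
# guesses the charset with chardet; these are assumptions for illustration,
# not the project's actual implementations.
import urlparse
import chardet

def get_url_ext_sketch(url):
    # extension of the path component, without the dot; '' if there is none
    path = urlparse.urlparse(url)[2]
    idx = path.rfind('.')
    return path[idx + 1:].lower() if idx != -1 else ''

def decode_data_sketch(data):
    # returns (unicode_text, detected_encoding); falls back to utf-8
    coding = chardet.detect(data)['encoding'] or 'utf-8'
    return data.decode(coding, 'ignore'), coding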
def save_data(self, path, item):
    '''Save the page to disk.'''
    try:
        ext = funcs.get_url_ext(item.url)
        if ext in settings.S_img_ext:
            w_type = 'wb'  # images are binary
            data = item.content
        elif ext in ('css', 'js') or item.url.startswith('http://bits.wikimedia.org'):
            w_type = 'w'
            data = item.content
        else:
            w_type = 'w'
            data = item.soup.prettify()  # normalize everything to utf-8
        if len(data.strip()) > 0:
            f = open(path, w_type)
            f.write(data)
            f.close()
            return True
        else:
            raise Exception(item.url + ' content is empty')
    except Exception as e:
        msg = 'pipeline: ' + item.url + ' save fail: ' + str(e)
        print msg
        save_fail_log = os.path.abspath('.').replace('\\', '/') + '/save_fail.txt'
        f = open(save_fail_log, 'a')
        f.write(msg + '\n')
        f.close()
        return False
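# Note on w_type above: on Windows, text mode ('w') translates '\n' into
# '\r\n' on write, which corrupts binary payloads such as images; hence
# 'wb' for anything in settings.S_img_ext. A tiny self-contained sketch of
# the same decision (the names here are illustrative only):
def write_payload_sketch(path, data, is_binary):
    mode = 'wb' if is_binary else 'w'
    f = open(path, mode)
    f.write(data)
    f.close()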
def update_css_js(self, item):
    '''Rewrite css/js references and part of the page links.'''
    for href in item.all_hrefs:
        turl = href.split('#')[0].strip()  # strip the anchor
        if turl == '' or turl.startswith('javascript') or turl.startswith('#'):
            continue
        ext = funcs.get_url_ext(href).lower()
        if ext in ('css', 'js'):
            c_href = urlparse.urljoin(item.url, href)
            sub_url = urlparse.urlparse(c_href)[2]
            # turn http://www.csdn.net/../sdf into http://www.csdn.net/sdf
            if sub_url.startswith('/../'):
                sub_url = sub_url[3:]
            c_href = self.url_prefix_main + '/css_js' + sub_url
            patt_prefix = 'href'
        elif ext in ('jpg', 'png', 'gif', 'jpeg'):
            c_href = urlparse.urljoin(item.url, href)
            sub_url = urlparse.urlparse(c_href)[2]
            # turn http://www.csdn.net/../sdf into http://www.csdn.net/sdf
            if sub_url.startswith('/../'):
                sub_url = sub_url[3:]
            domain_name = funcs.url2domain(c_href)
            c_href = self.url_prefix_main + '/' + domain_name + sub_url
            patt_prefix = 'src'
        else:
            c_href = urlparse.urljoin(item.url, href)
            if len(re.findall('/', c_href)) == 2:
                # turn http://www.csdn.net into http://www.csdn.net/
                c_href = c_href + '/'
            domain_name = funcs.url2domain(c_href)
            c_idx = funcs.get_md52int(c_href)
            c_href = self.url_prefix_main + '/' + domain_name + urlparse.urlparse(c_href)[2]
            dir_path = os.path.dirname(c_href)
            c_href = dir_path + '/' + str(c_idx) + '.html'
            patt_prefix = 'href'
        try:
            p_href = funcs.get_re_patt(href)
            patt = patt_prefix + '=[\'"]' + p_href + '[\'"]'
            repl = patt_prefix + '="' + c_href + '"'
            item.content = re.sub(patt, repl, item.content)
        except Exception as e:
            print 'update_css_js: ' + str(e)
    return item
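# update_css_js leans on three more helpers from `funcs`. Hedged sketches
# under plain assumptions (not the project's actual code): get_re_patt
# escapes an href so it can sit inside a regex, url2domain extracts the
# host name, and get_md52int maps a URL to a stable integer id via md5.
import re
import urlparse
import hashlib

def get_re_patt_sketch(href):
    return re.escape(href)

def url2domain_sketch(url):
    return urlparse.urlparse(url)[1]  # the netloc component

def get_md52int_sketch(url):
    # first 8 hex digits of the md5 digest as an int: a short, stable file id
    return int(hashlib.md5(url).hexdigest()[:8], 16)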
def pipeline(self, item):
    '''Rewrite the page links, then save the page.'''
    if item.url == '':
        return
    root_dir = settings.S_root_dir
    sub_dir = urlparse.urlparse(item.url)[2]
    item, file_path = self.update_file_path(item, root_dir, sub_dir)
    # wiki css pages are php urls under http://bits.wikimedia.org;
    # is_need_modify returns False for those, so they are saved untouched
    if funcs.is_need_modify(item.url):
        item = self.modify_tree(item)
    if self.record(item):
        if self.check_dir_path(os.path.dirname(file_path)):
            if self.save_data(file_path, item):
                print 'pipeline: ' + str(self.thread_id) + ' : ' + str(item.idx) + ' : ' + item.url
    else:
        print 'pipeline: ' + str(self.thread_id) + ' : no need to update ' + str(item.idx) + ' : ' + item.url
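# check_dir_path is assumed to create the target directory tree on demand
# and report whether it exists afterwards; a minimal sketch, written to be
# race-tolerant since this pipeline runs with multiple thread_ids:
import os

def check_dir_path_sketch(dir_path):
    try:
        if not os.path.isdir(dir_path):
            os.makedirs(dir_path)
        return True
    except OSError:
        # another thread may have created it between the check and makedirs
        return os.path.isdir(dir_path)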
def parse(self, response):
    '''Parse the page content.'''
    response.url = response.url.strip()
    # wiki css pages are php urls under http://bits.wikimedia.org;
    # is_need_modify returns False for those, so they are stored as-is
    if funcs.is_need_modify(response.url):
        data, coding = funcs.decode_data(response.body)
        soup = BeautifulSoup(str(data), 'lxml', from_encoding='utf-8')
        soup, urls, css_urls, js_urls, img_urls = self.get_link(soup)
        all_urls = css_urls + js_urls + urls + img_urls
        for url in all_urls:
            vurl = funcs.valid_url(response.url, url).strip()  # keep only valid links
            if vurl != '':
                # fetch the Simplified Chinese variant of the page
                vurl = funcs.translate_simplify(vurl)
                _url = funcs.decode_data(vurl)[0].encode('utf-8')
                if _url:
                    vurl = _url
                yield Request(vurl)
        item = Item()
        item.url = response.url
        item.soup = soup
        item.content = str(soup)  # use the modified markup
        item.coding = coding      # content encoding
        item.file_length = int(len(response.body))  # original file size
        yield item
    else:
        item = Item()
        item.url = response.url
        item.soup = None
        item.content = response.body  # raw body, unmodified
        item.coding = ''
        item.file_length = int(len(response.body))  # original file size
        yield item
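# valid_url is assumed to resolve a link against the current page and drop
# schemes the crawler cannot fetch, returning '' for rejects (the '' contract
# is implied by the vurl != '' check above); a hedged sketch, not the
# project's code:
import urlparse

def valid_url_sketch(base_url, href):
    href = href.split('#')[0].strip()  # drop the anchor part
    if href == '' or href.startswith('javascript:') or href.startswith('mailto:'):
        return ''
    abs_url = urlparse.urljoin(base_url, href)
    return abs_url if abs_url.startswith('http') else ''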
if __name__ == '__main__':
    # quick self-test of the extension helper
    ext = funcs.get_url_ext('http://asdf.com/sdf.css')
    if ext in ('css', 'js'):
        print ext