def parse(self,response):
    """Analyse a fetched page.

    If the URL needs rewriting (funcs.is_need_modify), decode the body,
    parse it with BeautifulSoup, extract every a/css/js/img link via
    self.get_link, yield a Request for each valid link, and finally yield
    an Item carrying the (possibly modified) page. Otherwise yield an
    Item with the raw body untouched.
    """
    response.url = response.url.strip()
    # File extension of the URL.
    # NOTE(review): ext is computed but never used below — possibly leftover
    # from the commented-out filter on the next lines.
    ext=funcs.get_url_ext(response.url)
    # wiki css pages start with http://bits.wikimedia.org and are php — not analysed
    #if (ext not in settings.S_img_ext) and (ext not in ('css','js')) and not response.url.startswith('http://bits.wikimedia.org'):
    if funcs.is_need_modify(response.url):
        # data: decoded page text, coding: detected source encoding
        data,coding=funcs.decode_data(response.body)
        # NOTE(review): if decode_data returns unicode, str(data) can raise
        # UnicodeEncodeError on non-ASCII content under Python 2 — confirm
        # decode_data's return type.
        soup=BeautifulSoup(str(data),'lxml',from_encoding='utf-8')
        # get_link returns the (possibly rewritten) soup plus link lists by kind
        soup,urls,css_urls,js_urls,img_urls=self.get_link(soup)
        all_urls=css_urls+js_urls+urls+img_urls
        for url in all_urls:
            # Empty string means the link is not valid / not wanted.
            vurl=funcs.valid_url(response.url,url).strip()
            if vurl != '':
                # Fetch the simplified-Chinese version of the page.
                vurl = funcs.translate_simplify( vurl )
                # Normalise to utf-8 bytes before building the Request.
                _url=funcs.decode_data(vurl)[0].encode('utf-8')
                print _url
                if _url:
                    vurl=_url
                yield Request(vurl)
        item=Item()
        item.url=response.url
        item.soup=soup
        item.content=str(soup)                     # modified page content
        item.coding=coding                         # content encoding
        item.file_length=int(len(response.body))   # original body size in bytes
        yield item
    else:
        # Page does not need modification: pass the raw body through.
        item=Item()
        item.url=response.url
        item.soup=None
        item.content=response.body                 # raw, unmodified content
        item.coding=''                             # encoding unknown / not detected
        item.file_length=int(len(response.body))   # original body size in bytes
        yield item
def modify_tree(self,item):
    """Rewrite the links inside item.soup so the page works as a local mirror.

    Steps:
      1. Strip generally unwanted elements (both_need_clear), plus
         site-specific boilerplate for csdn / iteye.
      2. Rewrite every <a href> through funcs.transform_url.
      3. Rewrite <link href> css references: keep (transformed) only the
         http://bits.wikimedia.org/ ones, blank out all others.
      4. Rewrite <script src> references to point under
         self.url_prefix_main + '/css_js'.

    Returns the same item with item.soup modified in place.
    """
    # Remove elements no target site needs.
    item.soup=self.both_need_clear(item.soup)
    # Remove csdn-specific clutter.
    if settings.S_target_website=='csdn':
        item.soup=self.csdn_clear(item.soup)
    # Remove iteye-specific clutter.
    if settings.S_target_website=='iteye':
        item.soup=self.iteye_clear(item.soup)

    # Rewrite <a> links.
    a_links=item.soup.find_all('a',href=True)
    for a in a_links:
        href=a.get('href','')
        full_href = urlparse.urljoin(item.url,href)
        a['href'] = funcs.transform_url( full_href )

    # Rewrite css <link> hrefs.
    css_links=item.soup.find_all('link',href=True)
    for css in css_links:
        href=css.get('href','')
        full_href = urlparse.urljoin(item.url,href)
        if href.startswith('http://bits.wikimedia.org/'):
            css['href'] = funcs.transform_url( full_href )
        else:
            # Drop unrelated css links.
            css['href'] = ''

    # Rewrite <script src> links.
    js_links=item.soup.find_all('script',src=True)
    for js in js_links:
        # BUG FIX: was js.get('script',''), which always returned '' because
        # the URL lives in the 'src' attribute (the tags were selected with
        # src=True), so no <script> link was ever rewritten.
        href=js.get('src','')
        if funcs.valid_url(item.url,href):
            c_href=urlparse.urljoin(item.url,href)
            sub_url=urlparse.urlparse(c_href)[2]
            # Turn http://www.csdn.net/../sdf into http://www.csdn.net/sdf
            if sub_url.startswith('/../'):
                sub_url=sub_url[3:]
            c_href=self.url_prefix_main+'/css_js'+sub_url
            # BUG FIX: was js['script']=c_href — write the rewritten URL back
            # into the attribute browsers actually read.
            js['src']=c_href
    return item