Ejemplo n.º 1
0
 def parse(self,response):
     '''分析网页内容'''
     response.url = response.url.strip()
     ext=funcs.get_url_ext(response.url)                     #获取文件扩展名
     #wiki 的css页面为http://bits.wikimedia.org开头的php,不分析
     #if (ext not in settings.S_img_ext) and (ext not in ('css','js')) and not response.url.startswith('http://bits.wikimedia.org'):
     if funcs.is_need_modify(response.url):
         data,coding=funcs.decode_data(response.body)
         soup=BeautifulSoup(str(data),'lxml',from_encoding='utf-8')
         soup,urls,css_urls,js_urls,img_urls=self.get_link(soup)
         all_urls=css_urls+js_urls+urls+img_urls
         
         for url in all_urls:
             vurl=funcs.valid_url(response.url,url).strip()      #判断是否有效链接
             if vurl != '':
                 #下载简体中文的网页
                 vurl = funcs.translate_simplify( vurl )
                 _url=funcs.decode_data(vurl)[0].encode('utf-8')
                 print _url
                 if _url:
                     vurl=_url
                 yield Request(vurl)
                 
         item=Item()
         item.url=response.url
         item.soup=soup
         item.content=str(soup)                      #使用修改后的数据
         item.coding=coding                          #内容编码
         item.file_length=int(len(response.body))    #原始文件大小
         yield item
     else:
         item=Item()
         item.url=response.url
         item.soup=None
         item.content=response.body                  #使用修改后的数据
         item.coding=''                              #内容编码
         item.file_length=int(len(response.body))    #原始文件大小
         yield item
Ejemplo n.º 2
0
    def modify_tree(self, item):
        '''Rewrite the links inside item.soup so the page works from the
        local mirror, and return the (mutated) item.

        Steps: run the site-independent cleanup, then the site-specific
        cleanup selected by settings.S_target_website, then rewrite the
        href/src attributes of <a>, <link> and <script> tags in place.
        '''
        item.soup = self.both_need_clear(item.soup)
        # Strip csdn-specific clutter.
        if settings.S_target_website == 'csdn':
            item.soup = self.csdn_clear(item.soup)
        # Strip iteye-specific clutter.
        if settings.S_target_website == 'iteye':
            item.soup = self.iteye_clear(item.soup)

        # Rewrite <a> links: resolve against the page URL, then map to the
        # mirrored location.
        for a in item.soup.find_all('a', href=True):
            href = a.get('href', '')
            full_href = urlparse.urljoin(item.url, href)
            a['href'] = funcs.transform_url(full_href)

        # Rewrite css <link> hrefs.  Only the special wikimedia php-served
        # stylesheets are kept (transformed); unrelated css links are
        # deliberately blanked out.
        for css in item.soup.find_all('link', href=True):
            href = css.get('href', '')
            full_href = urlparse.urljoin(item.url, href)
            if href.startswith('http://bits.wikimedia.org/'):
                css['href'] = funcs.transform_url(full_href)
            else:
                css['href'] = ''

        # Rewrite <script src=...> to the locally mirrored /css_js path.
        # BUG FIX: the original read js.get('script') and wrote
        # js['script'], but find_all('script', src=True) matches on the
        # 'src' attribute -- 'script' is never present, so href was always
        # '' and the rewrite never fired.  Use 'src' on both sides.
        for js in item.soup.find_all('script', src=True):
            href = js.get('src', '')
            if funcs.valid_url(item.url, href):
                c_href = urlparse.urljoin(item.url, href)
                # Normalize http://host/../x into http://host/x
                sub_url = urlparse.urlparse(c_href)[2]
                if sub_url.startswith('/../'):
                    sub_url = sub_url[3:]
                js['src'] = self.url_prefix_main + '/css_js' + sub_url
        return item