Exemple #1
0
 def get_table_data(self,table_name):
     '''Read a database table and register each of its rows as a URL mark.

     Rows are fetched in id-range batches of 10**6 ids through self.cur.
     For tables whose name contains 'queue' the mark value is recomputed
     from the row's url with funcs.get_md52int; for every other table the
     row id itself (as an int) is used as the mark value.

     table_name -- name of the table to read.  It is interpolated directly
                   into the SQL text (table names cannot be bound as query
                   parameters), so it must come from trusted configuration.

     Returns None.  Progress and the number of rows for which
     self.set_url_mark() returned a true value are printed to stdout.
     '''
     print('    reading %s table' % table_name)
     self.cur.execute('select count(*) from %s' % table_name)
     count = self.cur.fetchone()[0]
     record_num = 0
     if count > 0:
         self.cur.execute('select min(id),max(id) from %s' % table_name)
         min_id, max_id = self.cur.fetchone()
         print('     %s %s' % (min_id, max_id))
         step = 10**6
         # Both values are loop-invariant, so build them once.
         is_queue = 'queue' in table_name
         sql = ('select id,url from %s' % table_name) + ' where id between %s and %s'
         idx = min_id
         while idx <= max_id:
             # BETWEEN is inclusive at both ends, so the upper bound must be
             # idx+step-1; using idx+step would fetch (and mark) the boundary
             # row a second time in the next batch.
             self.cur.execute(sql, (str(idx), str(idx + step - 1)))
             for record in self.cur.fetchall():
                 if is_queue:
                     # The md5 value of queue-table rows has to be
                     # recomputed from the url.
                     i_md5 = funcs.get_md52int(record[1])
                 else:
                     i_md5 = int(record[0])
                 if self.set_url_mark(i_md5):
                     record_num += 1
             idx += step
     print('    done,%s has %d records' % (table_name, record_num))
Exemple #2
0
 def add_request(self,request,cur,cxn):
     '''Add the request's URL to the target website's queue table.

     The URL is inserted only when self.set_url_mark() returns a true
     value for the URL's funcs.get_md52int() value; otherwise nothing is
     written.  The whole operation runs while holding
     self.d_locks['add_request_lock'].

     request -- object whose .url attribute is the URL to enqueue.
     cur     -- database cursor used to run the INSERT.
     cxn     -- database connection; committed after a successful INSERT.

     Any exception raised by the database calls propagates to the caller
     after the lock has been released.
     '''
     lock = self.d_locks['add_request_lock']
     lock.acquire()
     try:
         l_md5 = funcs.get_md52int(request.url)
         if self.set_url_mark(l_md5):
             sql_prefix = 'insert into %s_queue' % settings.S_target_website
             sql = sql_prefix + ' (url) values(%s)'
             # DB-API parameters must be a sequence; a bare string is
             # rejected (or iterated per character) by some drivers.
             cur.execute(sql, (request.url,))
             cxn.commit()
             # NOTE(review): if the INSERT fails the URL mark stays set, so
             # the URL will not be retried -- confirm this is acceptable.
     finally:
         # Always release, otherwise one failed INSERT would block every
         # later caller forever.
         lock.release()
Exemple #3
0
 def update_css_js(self,):
     '''Rewrite css, js, image and page links in self.item.content.

     For every entry of self.item.all_hrefs the link is resolved against
     self.item.url and mapped to a location under self.url_prefix_main:

       * css / js          -> <prefix>/css_js<path>            (href="...")
       * jpg/png/gif/jpeg  -> <prefix>/<domain><path>          (src="...")
       * anything else     -> <prefix>/<domain><dir>/<id>.html (href="...")
         where <id> is funcs.get_md52int() of the resolved URL.

     Each occurrence of the original attribute in self.item.content is then
     replaced in place.  Errors while building or applying a replacement
     pattern are printed and the loop continues with the next link.
     '''
     for href in self.item.all_hrefs:
         turl = href.split('#')[0].strip()         # drop the anchor
         # Skip empty links and javascript: pseudo-URLs.  The scheme is
         # matched together with its colon so that ordinary relative paths
         # such as "javascripts/app.js" are still rewritten.
         if not turl or turl.lower().startswith('javascript:'):
             continue

         ext = funcs.get_url_ext(href).lower()
         abs_url = urlparse.urljoin(self.item.url, href)
         if ext in ('css', 'js', 'jpg', 'png', 'gif', 'jpeg'):
             # Turn http://www.csdn.net/../sdf into http://www.csdn.net/sdf:
             # strip every leading "/.." segment left over by urljoin.
             sub_url = urlparse.urlparse(abs_url)[2]
             while sub_url.startswith('/../'):
                 sub_url = sub_url[3:]
             if ext in ('css', 'js'):
                 c_href = self.url_prefix_main + '/css_js' + sub_url
                 patt_prefix = 'href'
             else:
                 domain_name = funcs.url2domain(abs_url)
                 c_href = self.url_prefix_main + '/' + domain_name + sub_url
                 patt_prefix = 'src'
         else:
             # Turn http://www.csdn.net into http://www.csdn.net/ (a URL
             # with only the two scheme slashes has no path yet).
             if abs_url.count('/') == 2:
                 abs_url = abs_url + '/'

             domain_name = funcs.url2domain(abs_url)
             c_idx = funcs.get_md52int(abs_url)
             local_path = self.url_prefix_main + '/' + domain_name + urlparse.urlparse(abs_url)[2]
             # Pages are stored under their directory, named by id.
             c_href = os.path.dirname(local_path) + '/' + str(c_idx) + '.html'
             patt_prefix = 'href'
         try:
             # presumably get_re_patt escapes href for use inside a regex --
             # verify against funcs.
             p_href = funcs.get_re_patt(href)
             patt = patt_prefix + '=[\'"]' + p_href + '[\'"]'
             replacement = patt_prefix + '="' + c_href + '"'
             # A function replacement is inserted literally; a plain string
             # would have backslashes / group references interpreted.
             self.item.content = re.sub(patt, lambda _m: replacement, self.item.content)
         except Exception as e:
             print('update_css_js:' + str(e))
Exemple #4
0
    def update_file_path(self,item,root_dir,sub_dir):
        '''Set the item's id and compute the path its file is saved under.

        item.idx is assigned funcs.get_md52int(item.url).  The save path is
        root_dir followed by the result of funcs.transform_url() applied to
        the first value returned by funcs.get_url_anchor(item.url).

        item     -- object with a .url attribute; its .idx is overwritten.
        root_dir -- string prefix of the returned path.
        sub_dir  -- accepted for interface compatibility; the incoming value
                    is not used, the sub-directory is derived from item.url.

        Returns the tuple (item, file_path).
        '''
        # Numeric id of the file, computed from its URL.
        item.idx = funcs.get_md52int(item.url)
        # Only the first element is needed; presumably it is the URL with
        # its anchor split off -- verify against funcs.get_url_anchor.
        base_url, _anchor = funcs.get_url_anchor(item.url)
        # Passing True means the URL is converted to a file save path.
        save_sub_dir = funcs.transform_url(base_url, True)
        return item, root_dir + save_sub_dir