def get_table_data(self, table_name):
    """Read every row of ``table_name`` and register its mark.

    Rows are fetched through ``self.cur`` in id windows of one million
    ids, so the whole table is never loaded with a single query.  For each
    row a mark value is derived and handed to ``self.set_url_mark``; rows
    for which that call returns a true value are counted.

    * If ``'queue'`` occurs in ``table_name`` the mark is recomputed from
      the row's url with ``funcs.get_md52int``.
    * Otherwise the row id itself, converted to ``int``, is the mark.

    Progress and the final count are printed to stdout.

    :param table_name: name of the table to read.  It is interpolated
        directly into the SQL text (identifiers cannot be bound as query
        parameters), so it must come from trusted configuration.
    """
    print(' reading %s table' % table_name)
    cur = self.cur

    cur.execute('select count(*) from %s' % table_name)
    count = cur.fetchone()[0]

    record_num = 0
    if count > 0:
        cur.execute('select min(id),max(id) from %s' % table_name)
        min_id, max_id = cur.fetchone()
        print('  %s %s' % (min_id, max_id))

        step = 10 ** 6
        # Both values are the same for every window, so build them once.
        from_queue = 'queue' in table_name
        sql = ('select id,url from %s' % table_name) + ' where id between %s and %s'

        idx = min_id
        while idx <= max_id:
            # BETWEEN includes both bounds; end the window one id before
            # the next window starts so no row is fetched twice.
            cur.execute(sql, (str(idx), str(idx + step - 1)))
            for record in cur.fetchall():
                if from_queue:
                    # Queue tables: the mark has to be recomputed from the url.
                    i_md5 = funcs.get_md52int(record[1])
                else:
                    i_md5 = int(record[0])
                if self.set_url_mark(i_md5):
                    record_num += 1
            idx += step

    print(' done,%s has %d records' % (table_name, record_num))
def add_request(self, request, cur, cxn):
    """Insert ``request.url`` into the ``<target website>_queue`` table.

    The url is reduced to a mark with ``funcs.get_md52int`` and passed to
    ``self.set_url_mark``; the row is inserted and committed only when that
    call returns a true value (presumably meaning the url had not been
    marked before -- confirm against ``set_url_mark``).

    The whole operation runs while holding
    ``self.d_locks['add_request_lock']``.

    :param request: object exposing the url to enqueue as ``request.url``.
    :param cur: database cursor used to run the insert.
    :param cxn: database connection on which the insert is committed.
    """
    lock = self.d_locks['add_request_lock']
    lock.acquire()
    try:
        l_md5 = funcs.get_md52int(request.url)
        if self.set_url_mark(l_md5):
            # The table name comes from settings and cannot be a bound
            # parameter; the url itself is bound by the driver.
            sql = ('insert into %s_queue' % settings.S_target_website) + ' (url) values(%s)'
            # Parameters must be a sequence, hence the 1-tuple.
            cur.execute(sql, (request.url,))
            cxn.commit()
    finally:
        # Always release, otherwise a failed insert/commit would block every
        # later caller on this lock.
        lock.release()
def update_css_js(self,):
    """Rewrite css, js, image and page links in ``self.item.content``.

    Every entry of ``self.item.all_hrefs`` is resolved against
    ``self.item.url`` and replaced in the page content by a location under
    ``self.url_prefix_main``:

    * ``css`` / ``js``: ``<prefix>/css_js<path>``, matched as ``href=``.
    * ``jpg`` / ``png`` / ``gif`` / ``jpeg``: ``<prefix>/<domain><path>``,
      matched as ``src=``.
    * anything else: ``<prefix>/<domain><dir of path>/<id>.html`` where
      ``<id>`` is ``funcs.get_md52int`` of the resolved url, matched as
      ``href=``.

    Links that are empty once the anchor is removed, or that start with
    ``javascript``, are skipped.  A failure while building or applying one
    substitution is printed and the loop carries on with the next link.
    """
    for href in self.item.all_hrefs:
        # Drop the anchor part; an anchor-only link leaves an empty string.
        turl = href.split('#')[0].strip()
        if turl == '' or turl.startswith('javascript'):
            continue

        ext = funcs.get_url_ext(href).lower()
        c_href = urlparse.urljoin(self.item.url, href)

        if ext in ('css', 'js', 'jpg', 'png', 'gif', 'jpeg'):
            sub_url = urlparse.urlparse(c_href)[2]
            # Turn http://www.csdn.net/../sdf into http://www.csdn.net/sdf
            if sub_url.startswith('/../'):
                sub_url = sub_url[3:]
            if ext in ('css', 'js'):
                # NOTE(review): .js files are usually referenced through
                # src=, which this href= pattern will not match -- confirm
                # whether that is intended.
                c_href = self.url_prefix_main + '/css_js' + sub_url
                patt_prefix = 'href'
            else:
                domain_name = funcs.url2domain(c_href)
                c_href = self.url_prefix_main + '/' + domain_name + sub_url
                patt_prefix = 'src'
        else:
            # Turn http://www.csdn.net into http://www.csdn.net/
            if c_href.count('/') == 2:
                c_href = c_href + '/'
            domain_name = funcs.url2domain(c_href)
            c_idx = funcs.get_md52int(c_href)
            c_href = self.url_prefix_main + '/' + domain_name + urlparse.urlparse(c_href)[2]
            dir_path = os.path.dirname(c_href)
            c_href = dir_path + '/' + str(c_idx) + '.html'
            patt_prefix = 'href'

        try:
            p_href = funcs.get_re_patt(href)
            patt = patt_prefix + '=[\'"]' + p_href + '[\'"]'
            replacement = patt_prefix + '="' + c_href + '"'
            # Use a callable so the replacement is inserted literally; a
            # plain string would have its backslashes parsed as escapes or
            # group references.
            self.item.content = re.sub(patt, lambda _match: replacement, self.item.content)
        except Exception as e:
            # Best effort: report the problem and continue with the next link.
            print('update_css_js:' + str(e))
def update_file_path(self, item, root_dir, sub_dir):
    """Give the page its id and work out the path it is saved under.

    ``item.idx`` is set to ``funcs.get_md52int(item.url)``.  The save path
    is ``root_dir`` followed by the result of ``funcs.transform_url`` on
    the url part returned by ``funcs.get_url_anchor``.

    :param item: page object; its ``url`` is read and its ``idx`` is set.
    :param root_dir: directory prefix prepended to the transformed url.
    :param sub_dir: accepted for interface compatibility; the incoming
        value is not used.
    :returns: tuple ``(item, file_path)``.
    """
    # Compute the id number of the file.
    item.idx = funcs.get_md52int(item.url)

    bare_url, _anchor = funcs.get_url_anchor(item.url)

    # Passing True signals that the url is being converted into the path
    # the file is saved under.
    saved_rel_path = funcs.transform_url(bare_url, True)

    return item, root_dir + saved_rel_path