def __init__(self, site_id, Name, runtime_queue, list, per_max_num, Flock, home_urls):
    '''
    site_id: used to obtain the corresponding directories
    '''
    threading.Thread.__init__(self, name=Name)
    self.runtime_queue = runtime_queue
    #self.result = result
    # path management
    self.path = path(site_id)
    self.num = 0
    self.maxnum = per_max_num
    self.list = list
    self.Flock = Flock
    #self.sqlite=sqlite3.connect('store/qlin.db')
    self.urltest = Urltest(home_urls)
    self.htmlparser = Collector()
    self.collector = collector(home_urls)
    # initialize home_list
    self.home_urls = home_urls
    self.inqueue = Queue()
    # clean up the original directories:
    # create the site directory
    self.path.mk_dir(self.path.g_site())
    # remove the old urltest file
    self.path.rm_file(self.path.g_urltest())
    # clear the document directory
    self.path.clean_dir(self.path.g_document())
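# A minimal standalone sketch (not part of the project) of the queue
# handoff the crawler threads rely on: each reptile blocks on
# runtime_queue.get() until the controller seeds the first URL with
# runtime_queue.put(startpage), exactly as run() does further below.
import threading
from Queue import Queue   # Python 2 stdlib; queue.Queue on Python 3

def worker(q):
    while True:
        url = q.get()          # blocks until a URL is available
        if url is None:        # sentinel: shut the worker down
            break
        print 'fetching %s' % url

q = Queue()
t = threading.Thread(target=worker, args=(q,))
t.start()
q.put('http://example.com/')   # like runtime_queue.put(startpage)
q.put(None)
t.join()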
def run(self, site_id):
    '''
    Drive the whole pipeline; all locations come from the path manager.
    '''
    self.site_id = site_id
    # NB: do not shadow the `path` class with a local of the same name,
    # or the constructor call below raises UnboundLocalError
    p = path(site_id)
    # word segmentation
    self.p = Parser(p.g_document(), p.g_wordsplit(), p.g_wordbar())
    self.url_sort(p.g_urltest(), p.g_sorted_url())
    self.parser()
    # rename the files
    self.url_trans_dir(p.g_sorted_url(), p.g_document(), p.g_wordsplit())
    # index
    self.p.transWbar()
    # database handling
    self.title_des(p.g_sorted_url())
    self.index(p.g_wordbar(), p.g_wordsplit(), p.g_hits())
    # sort the hits
    self.sort_hit(p.g_hit_size(), p.g_hits() + '/')
def __init__(self, site_id):
    '''
    Initialize the directories.
    '''
    self.path = path(site_id)
    self.ict = Ictclas('ICTCLAS50/')
    #self.wordbar=wordlist()#wordBar
    self.spword = '@chunwei@'
    self.xmlph = self.path.g_document()
    self.wsplitph = self.path.g_wordsplit()
    self.wbpath = self.path.g_wordbar()
    # initialize the thesaurus
    self.wordbar = Thes.Create_Thesaurus(self.wbpath)
    # database setup
    self.cx = sq.connect(self.path.g_chun_sqlite())
    self.cu = self.cx.cursor()
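# Hedged illustration (an assumption, not confirmed by the source):
# spword reads like the delimiter written between segmented words in the
# wordsplit files, so a record would split back into words like this:
spword = '@chunwei@'
record = 'word1@chunwei@word2@chunwei@word3'
print record.split(spword)   # -> ['word1', 'word2', 'word3']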
def __init__(self, site_id):
    '''
    init
    '''
    self.path = path(site_id)
    # temporary design: filter out unwanted characters
    self.str_test = re.compile(r"(\w|=|'|&|:)")
    self.length = 0
    self.cx = sq.connect(self.path.g_chun_sqlite())
    self.cu = self.cx.cursor()
    self.ict = Ictclas.Ictclas('ICTCLAS50/')
    self.urlbar = urlbar(self.path.g_sorted_url())
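# A minimal sketch (an assumption about intent, not from the source) of
# what str_test matches: single word characters plus the characters
# typical of query strings (=, ', &, :) -- findall() yields one match
# per character.
import re
str_test = re.compile(r"(\w|=|'|&|:)")
print str_test.findall("id=3&k='v'")
# -> ['i', 'd', '=', '3', '&', 'k', '=', "'", 'v', "'"]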
def run(self, site_id):
    '''
    Run the main program.  Works with the database: load site_id
    and pull in the related site information.
    '''
    # directory management
    p = path(site_id)
    if os.path.exists('store/sites/' + str(site_id)):
        p.rm_file(p.g_urltest())
        p.clean_dir(p.g_document())
    else:
        os.mkdir('store/sites/' + str(site_id))
    runtime_queue = Queue()
    list = Urlist()
    Flock = threading.RLock()
    thlist = []
    site_infor = self.site.gets(site_id)
    startpage = site_infor[2]
    # add home_urls
    home_urls = site_infor[3].split('\r\n')
    head = site_infor[4]
    per_max_num = site_infor[5]
    for i in range(self.thread_num):
        # the prefix changes per site: name each crawler
        # thread after the site's prefix
        th = reptile(site_id, head + str(i), runtime_queue, list,
                     per_max_num, Flock, home_urls)
        thlist.append(th)
    for i in thlist:
        i.start()
    runtime_queue.put(startpage)
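# Assumed layout of the row returned by self.site.gets(site_id), inferred
# from the indices used above (a guess -- only fields 2..5 are confirmed
# by this code):
#   site_infor[2] -> startpage       (seed URL put on runtime_queue)
#   site_infor[3] -> home_urls text  ('\r\n'-separated allowed URL prefixes)
#   site_infor[4] -> head            (thread-name prefix for the crawlers)
#   site_infor[5] -> per_max_num     (per-thread page cap)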
from index.indexer import Indexer, Sort_hits
from query.path import path

p = path(0)
'''
index=Indexer(0)
index.run()
'''
hit_sort = Sort_hits(p.g_hit_size())
for i in range(20):
    hit_sort.sort_wid(p.g_hits() + '/', i)
    hit_sort.save(p.g_hits() + '/', i)
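# Hedged note: the loop above assumes the hits are partitioned into 20
# block files under g_hits() (main() below mentions using STEP to unify
# the number of index blocks).  A sketch of one plausible bucketing rule
# -- an assumption, the real rule lives in Indexer/Sort_hits:
def hit_block(wid, nblocks=20):
    return wid % nblocks   # block file that holds the hits of word id `wid`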
def run(self, site_id):
    '''
    Run the main program.
    '''
    p = path(site_id)
    # initialize the directories:
    # clear wordsplit
    p.clean_dir(p.g_wordsplit())
    # clear hits
    p.clean_dir(p.g_hits())
    # initialize the database
    p.cp_chun()
    ###################################################################
    #                         parse: Parser                           #
    ###################################################################
    parser = Parser(site_id)
    ###################################################################
    #                     URL handling: url_sort                      #
    ###################################################################
    url_trans = UrlTransID(p.g_urltest())
    # sort the URLs
    url_trans.sort()
    # save as sorted_url.txt
    url_trans.save(p.g_sorted_url())
    ###################################################################
    #                  parser: word segmentation                      #
    ###################################################################
    parser.splitWord()
    ###################################################################
    #                  rename the files by docID                      #
    ###################################################################
    url_trans_dir = UrlTransDir(p.g_sorted_url())
    # rename document
    url_trans_dir.renameDoc(p.g_document())
    # rename wordsplit
    url_trans_dir.renameDoc(p.g_wordsplit())
    ###################################################################
    #                 parser: generate the wordbar                    #
    ###################################################################
    parser.transWbar()
    ###################################################################
    #                        title handling                           #
    ###################################################################
    title_des_sqlite = Title_des_sqlite(site_id)
    # refresh the original data
    title_des_sqlite.clear()
    title_des_sqlite.run()
    title_des_sqlite.add_url()
    title_des_sqlite.intro_split_des_title()
    title_des_sqlite.cx.commit()
    ###################################################################
    #                        index: indexing                          #
    ###################################################################
    index = Indexer(site_id)
    index.run()
    ###################################################################
    #                    index: sort the hits                         #
    ###################################################################
    hit_sort = Sort_hits(p.g_hit_size())
    for i in range(20):
        hit_sort.sort_wid(p.g_hits() + '/', i)
        hit_sort.save(p.g_hits() + '/', i)
def main(self):
    '''
    Pre-processing for the main site.

    For the index, STEP has already been used to unify the number of
    index blocks.

    Basic idea: merge the information of all sites together, then
    process it as one site.  Must run before the sub-sites.
    '''
    p = path(0)
    ###################################################################
    #                      initialize the paths                       #
    ###################################################################
    # clear document, in preparation for copying every site's documents
    p.clean_dir(p.g_document())
    ###################################################################
    #                         copy document                           #
    ###################################################################
    for li in os.listdir('store/sites'):
        print li
        site_document = os.path.join('store/sites', li, 'document')
        print site_document
        #print site_path
        for f in os.listdir(site_document):
            # copy each file into the aggregate document directory
            file_path = os.path.join(site_document, f)
            shutil.copyfile(file_path, 'store/document/' + f)
            print 'successfully copy %s to document' % os.path.join(site_document, f)
    ###################################################################
    #                         copy urltest                            #
    ###################################################################
    # first clear urltest
    p.rm_file(p.g_urltest())
    u_file = open(p.g_urltest(), 'a+')
    for site_dir in os.listdir('store/sites/'):
        url_ph = os.path.join('store/sites', site_dir, 'urltest.txt')
        f = open(url_ph)
        c = f.read()
        f.close()
        # append to u_file
        u_file.write(c)
    u_file.close()
    p.clean_dir(p.g_wordsplit())
    # clear hits
    p.clean_dir(p.g_hits())
    # initialize the database
    p.cp_chun()
    ###################################################################
    #                  process the aggregate site                     #
    ###################################################################
    self.run(0)
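# A minimal standalone sketch (assumptions: same directory layout,
# Python 2; out_path is a hypothetical stand-in for p.g_urltest()) of the
# urltest merge performed above, using with-blocks so the files are
# always closed even if a read fails:
import os

def merge_urltest(sites_dir='store/sites', out_path='store/urltest.txt'):
    with open(out_path, 'a+') as out:
        for site_dir in os.listdir(sites_dir):
            url_ph = os.path.join(sites_dir, site_dir, 'urltest.txt')
            if not os.path.isfile(url_ph):
                continue                 # skip sites with no crawl output
            with open(url_ph) as f:
                out.write(f.read())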