Example #1
0
    def __init__(self, site_id, Name, runtime_queue, list, per_max_num, Flcok,
                 home_urls):
        '''
        Crawler worker thread for one site.

        site_id:       site id; selects the per-site storage directory
        Name:          thread name handed to threading.Thread
        runtime_queue: shared queue of URLs still to be crawled
        list:          shared URL bookkeeping object (NOTE(review): shadows
                       the builtin ``list``; name kept for interface
                       compatibility)
        per_max_num:   maximum number of pages this thread may fetch
        Flcok:         shared lock (NOTE(review): presumably a typo for
                       "Flock"; name kept for interface compatibility)
        home_urls:     root URLs used to scope the crawl
        '''

        threading.Thread.__init__(self, name=Name)
        self.runtime_queue = runtime_queue
        #self.result = result
        # per-site path manager
        self.path = path(site_id)

        self.num = 0                 # pages fetched so far
        self.maxnum = per_max_num    # fetch limit for this thread
        self.list = list
        self.Flcok = Flcok
        #self.sqlite=sqlite3.connect('store/qlin.db')

        self.urltest = Urltest(home_urls)
        self.htmlparser = Collector()
        self.collector = collector(home_urls)
        # initialise home_list
        self.home_urls = home_urls
        self.inqueue = Queue()

        # sweep the site's raw storage directories before crawling starts:
        # create the site directory
        self.path.mk_dir(self.path.g_site())
        # remove the previous urltest file
        self.path.rm_file(self.path.g_urltest())
        # empty the document directory (original comment "晴空" is a typo
        # for "清空", i.e. "clear")
        self.path.clean_dir(self.path.g_document())
Example #2
0
    def __init__(self, site_id, Name, runtime_queue, list, per_max_num, Flcok,
                 home_urls):
        '''
        Crawler worker thread for one site.

        site_id:       site id; selects the per-site storage directory
        Name:          thread name handed to threading.Thread
        runtime_queue: shared queue of URLs still to be crawled
        list:          shared URL bookkeeping object (NOTE(review): shadows
                       the builtin ``list``; name kept for interface
                       compatibility)
        per_max_num:   maximum number of pages this thread may fetch
        Flcok:         shared lock (NOTE(review): presumably a typo for
                       "Flock"; name kept for interface compatibility)
        home_urls:     root URLs used to scope the crawl
        '''

        threading.Thread.__init__(self, name=Name)
        self.runtime_queue = runtime_queue
        #self.result = result
        # per-site path manager
        self.path = path(site_id)

        self.num = 0                 # pages fetched so far
        self.maxnum = per_max_num    # fetch limit for this thread
        self.list = list
        self.Flcok = Flcok
        #self.sqlite=sqlite3.connect('store/qlin.db')

        self.urltest = Urltest(home_urls)
        self.htmlparser = Collector()
        self.collector = collector(home_urls)
        # initialise home_list
        self.home_urls = home_urls
        self.inqueue = Queue()

        # sweep the site's raw storage directories before crawling starts:
        # create the site directory
        self.path.mk_dir(self.path.g_site())
        # remove the previous urltest file
        self.path.rm_file(self.path.g_urltest())
        # empty the document directory
        self.path.clean_dir(self.path.g_document())
Example #3
0
    def run(self, site_id):
        '''
        Drive the combined post-crawl pipeline for one site: word
        segmentation, URL sorting, parsing, file renaming, indexing and
        hit sorting.

        Bug fix: the original did ``path = path(site_id)``.  Assigning to
        ``path`` makes it a local variable for the whole function, so the
        call on the right-hand side raises UnboundLocalError before the
        class is ever reached.  The instance is now bound to ``p``,
        matching the convention used elsewhere in this project.
        '''
        self.site_id = site_id

        # per-site path manager
        p = path(site_id)

        # word segmentation
        self.p = Parser(p.g_document(), p.g_wordsplit(), p.g_wordbar())

        self.url_sort(p.g_urltest(), p.g_sorted_url())

        self.parser()
        # rename files according to the sorted URL ids
        self.url_trans_dir(p.g_sorted_url(), p.g_document(), p.g_wordsplit())
        # index
        self.p.transWbar()

        # database handling
        self.title_des(p.g_sorted_url())

        self.index(p.g_wordbar(), p.g_wordsplit(), p.g_hits())
        # sort the hits
        self.sort_hit(p.g_hit_size(), p.g_hits() + '/')
Example #4
0
    def __init__(self, site_id):
        '''
        Initialise the per-site directories, the word segmenter, the
        vocabulary and the database connection.

        site_id: id of the site whose storage directories are used
        '''
        self.path = path(site_id)

        # ICTCLAS Chinese word segmenter (data files under ICTCLAS50/)
        self.ict = Ictclas('ICTCLAS50/')
        #self.wordbar=wordlist()#wordBar
        # separator token inserted between split words
        self.spword = '@chunwei@'

        self.xmlph = self.path.g_document()      # raw document directory
        self.wsplitph = self.path.g_wordsplit()  # word-split output directory
        self.wbpath = self.path.g_wordbar()      # vocabulary (wordbar) file

        # initialise the vocabulary
        self.wordbar = Thes.Create_Thesaurus(self.wbpath)

        # database connection and cursor
        self.cx = sq.connect(self.path.g_chun_sqlite())
        self.cu = self.cx.cursor()
Example #5
0
    def __init__(self, site_id):
        '''
        Initialise the per-site directories, the word segmenter, the
        vocabulary and the database connection.

        site_id: id of the site whose storage directories are used
        '''
        self.path = path(site_id)

        # ICTCLAS Chinese word segmenter (data files under ICTCLAS50/)
        self.ict = Ictclas('ICTCLAS50/')
        #self.wordbar=wordlist()#wordBar
        # separator token inserted between split words
        self.spword = '@chunwei@'

        self.xmlph = self.path.g_document()      # raw document directory
        self.wsplitph = self.path.g_wordsplit()  # word-split output directory
        self.wbpath = self.path.g_wordbar()      # vocabulary (wordbar) file

        # initialise the vocabulary
        self.wordbar = Thes.Create_Thesaurus(self.wbpath)

        # database connection and cursor
        self.cx = sq.connect(self.path.g_chun_sqlite())
        self.cu = self.cx.cursor()
Example #6
0
    def __init__(self, site_id):
        '''
        Initialise paths, the character-filter regex, the database
        connection, the word segmenter and the URL bar.

        site_id: id of the site whose storage directories are used
        '''
        self.path = path(site_id)

        # provisional design: characters that must be filtered out.
        # Raw string so the ``\w`` escape is explicit (the original
        # non-raw literal relied on Python passing unknown escapes
        # through, which is a warning on modern interpreters).
        self.str_test = re.compile(r"(\w|=|'|&|:)")

        self.length = 0

        # database connection and cursor
        self.cx = sq.connect(self.path.g_chun_sqlite())
        self.cu = self.cx.cursor()

        # ICTCLAS Chinese word segmenter (data files under ICTCLAS50/)
        self.ict = Ictclas.Ictclas('ICTCLAS50/')

        self.urlbar = urlbar(self.path.g_sorted_url())
Example #7
0
    def run(self, site_id):
        '''
        Main entry point: load site ``site_id``'s configuration from the
        database and start one crawler thread per configured worker.

        Fix: the shared URL bookkeeping object was bound to ``list``,
        shadowing the builtin; the local is renamed to ``url_list``
        (it is only passed positionally, so the interface is unchanged).
        '''
        # directory management: clean an existing site directory,
        # otherwise create it
        p = path(site_id)
        if os.path.exists('store/sites/' + str(site_id)):
            p.rm_file(p.g_urltest())
            p.clean_dir(p.g_document())
        else:
            os.mkdir('store/sites/' + str(site_id))

        runtime_queue = Queue()
        url_list = Urlist()
        Flock = threading.RLock()
        thlist = []

        site_infor = self.site.gets(site_id)
        startpage = site_infor[2]
        # home_urls: one root URL per line
        home_urls = site_infor[3].split('\r\n')
        head = site_infor[4]          # thread-name prefix for this site
        per_max_num = site_infor[5]   # per-thread fetch limit

        for i in range(self.thread_num):
            # crawler threads are named after the site's prefix
            th = reptile(site_id, head + str(i), runtime_queue, url_list,
                         per_max_num, Flock, home_urls)
            thlist.append(th)

        for th in thlist:
            th.start()

        # seed the queue once the workers are running
        runtime_queue.put(startpage)
Example #8
0
    def run(self, site_id):
        '''
        Main entry point: load site ``site_id``'s configuration from the
        database and start one crawler thread per configured worker.

        Fix: the shared URL bookkeeping object was bound to ``list``,
        shadowing the builtin; the local is renamed to ``url_list``
        (it is only passed positionally, so the interface is unchanged).
        '''
        # directory management: clean an existing site directory,
        # otherwise create it
        p = path(site_id)
        if os.path.exists('store/sites/' + str(site_id)):
            p.rm_file(p.g_urltest())
            p.clean_dir(p.g_document())
        else:
            os.mkdir('store/sites/' + str(site_id))

        runtime_queue = Queue()
        url_list = Urlist()
        Flock = threading.RLock()
        thlist = []

        site_infor = self.site.gets(site_id)
        startpage = site_infor[2]
        # home_urls: one root URL per line
        home_urls = site_infor[3].split('\r\n')
        head = site_infor[4]          # thread-name prefix for this site
        per_max_num = site_infor[5]   # per-thread fetch limit

        for i in range(self.thread_num):
            # crawler threads are named after the site's prefix
            th = reptile(site_id, head + str(i), runtime_queue, url_list,
                         per_max_num, Flock, home_urls)
            thlist.append(th)

        for th in thlist:
            th.start()

        # seed the queue once the workers are running
        runtime_queue.put(startpage)
Example #9
0
from index.indexer import Indexer,Sort_hits
from query.path import path

# per-site path manager for the master site (id 0)
store = path(0)

# indexing step, currently disabled:
#index = Indexer(0)
#index.run()


# sort and persist the hits for each of the 20 word-id buckets
sorter = Sort_hits(store.g_hit_size())

for bucket in range(20):
    sorter.sort_wid(store.g_hits() + '/', bucket)

    sorter.save(store.g_hits() + '/', bucket)
Example #10
0
    def run(self, site_id):
        '''
        Run the full post-crawl processing pipeline for one site:
        clean directories, sort the crawled URLs, split words, rename
        files by docID, build the vocabulary, fill the title/description
        tables, build the index and finally sort the hit files.
        '''
        p = path(site_id)
        # initialise directories:
        # empty wordsplit
        p.clean_dir(p.g_wordsplit())
        # empty hits
        p.clean_dir(p.g_hits())
        # initialise the database
        p.cp_chun()

        ###################################################################
        #   parsing
        #
        ###################################################################
        parser = Parser(site_id)

        ###################################################################
        #   URL handling: sort the crawled URLs
        #
        ###################################################################
        url_trans = UrlTransID(p.g_urltest())
        url_trans.sort()
        # persist as sorted_url.txt
        url_trans.save(p.g_sorted_url())

        ###################################################################
        #   word segmentation
        #
        ###################################################################
        parser.splitWord()

        ###################################################################
        #   rename files according to their docID
        #
        ###################################################################
        url_trans_dir = UrlTransDir(p.g_sorted_url())
        # rename the raw documents
        url_trans_dir.renameDoc(p.g_document())
        # rename the word-split files
        url_trans_dir.renameDoc(p.g_wordsplit())

        ###################################################################
        #   build the vocabulary (wordbar)
        #
        ###################################################################
        parser.transWbar()

        ###################################################################
        #   title / description handling
        #
        ###################################################################
        title_des_sqlite = Title_des_sqlite(site_id)
        # refresh the original data
        title_des_sqlite.clear()
        title_des_sqlite.run()
        title_des_sqlite.add_url()
        title_des_sqlite.intro_split_des_title()
        title_des_sqlite.cx.commit()

        ###################################################################
        #   indexing
        #
        ###################################################################
        index = Indexer(site_id)
        index.run()

        ###################################################################
        #   sort the hits, one file per word-id bucket
        #
        ###################################################################
        hit_sort = Sort_hits(p.g_hit_size())
        # hoisted: the hits directory is invariant across the loop
        hits_dir = p.g_hits() + '/'
        for i in range(20):
            hit_sort.sort_wid(hits_dir, i)
            hit_sort.save(hits_dir, i)
Example #11
0
    def main(self):
        '''
        Master-site pre-processing.

        Merges every sub-site's documents and crawled URL list into the
        master site (site id 0), then runs the normal per-site pipeline
        on the merged data.  Must run before the sub-sites are processed
        further.  The index has already been divided into a uniform
        number of blocks via STEP.

        Fixes: file handles are now managed with ``with`` so they are
        closed even if a sub-site file is missing or unreadable.
        '''
        p = path(0)

        ###################################################################
        #   initialise paths
        #
        ###################################################################
        # empty the master document directory before copying into it
        p.clean_dir(p.g_document())

        ###################################################################
        #   copy every sub-site's documents
        #
        ###################################################################
        for li in os.listdir('store/sites'):
            print(li)
            site_document = os.path.join('store/sites', li, 'document')
            print(site_document)

            for f in os.listdir(site_document):
                # copy each file into the master document directory
                file_path = os.path.join(site_document, f)
                shutil.copyfile(file_path, 'store/document/' + f)
                print('successfully copy %s to document' % os.path.join(site_document, f))

        ###################################################################
        #   concatenate every sub-site's urltest file
        #
        ###################################################################
        # remove the old master urltest first
        p.rm_file(p.g_urltest())

        with open(p.g_urltest(), 'a+') as u_file:
            for site_dir in os.listdir('store/sites/'):
                url_ph = os.path.join('store/sites', site_dir, 'urltest.txt')
                # append each sub-site's URL list to the master file
                with open(url_ph) as f:
                    u_file.write(f.read())

        p.clean_dir(p.g_wordsplit())

        # empty hits
        p.clean_dir(p.g_hits())
        # initialise the database
        p.cp_chun()

        ###################################################################
        #   run the normal pipeline on the merged master site (id 0)
        #   (original banner said "copy urltest" -- mislabeled)
        ###################################################################
        self.run(0)