コード例 #1
0
ファイル: Parser.py プロジェクト: Superjom/LAN-SEO
 def transDoc(self):
     """Convert every HTML file in ``self.htmlph`` into an XML document.

     For each directory entry the raw content is read, its encoding is
     guessed with ``chardet`` and, unless it is reported as ``'utf-8'``,
     the content is decoded.  The result is handed to ``collector`` and
     ``collector(...).xml(name).toxml()`` is written to a file with the
     same name under ``self.xmlph``.

     Decoding and serialisation failures are reported on stdout and do not
     stop the loop (best effort); the output file is still created.
     """
     for name in os.listdir(self.htmlph):
         print(name)
         # Read raw bytes: encoding detection must see the undecoded content.
         # The context manager closes the handle even if reading fails.
         with open(os.path.join(self.htmlph, name), 'rb') as src:
             content = src.read()

         # Detect the encoding and convert when it is not reported as utf-8.
         detected = chardet.detect(content)
         print(detected)
         coding = detected['encoding']
         if coding != 'utf-8':
             try:
                 content = content.decode(coding)
             except (UnicodeError, LookupError, TypeError):
                 # Undecodable bytes, unknown codec name, or no encoding
                 # detected (None): keep the raw content and carry on.
                 print('something wrong')

         parsed = collector(content)  # start parsing

         # Write the generated XML next to the other converted documents.
         with open(os.path.join(self.xmlph, name), 'w') as dst:
             try:
                 dst.write(parsed.xml(name).toxml())
             except Exception:
                 # Best effort: one bad document must not abort the batch,
                 # but Ctrl-C / SystemExit are no longer swallowed.
                 print('can not trans xml')
コード例 #2
0
ファイル: Reptile.py プロジェクト: Superjomn/qlin
    def __init__(self, site_id, Name, runtime_queue, list, per_max_num, Flcok,
                 home_urls):
        """Initialise the thread and reset the site's working directories.

        site_id       -- passed to ``path()`` to obtain the directory manager
        Name          -- thread name handed to ``threading.Thread``
        runtime_queue -- stored as ``self.runtime_queue``
        list          -- stored as ``self.list``
        per_max_num   -- stored as ``self.maxnum``
        Flcok         -- stored as ``self.Flcok`` (presumably a shared lock;
                         confirm against the caller)
        home_urls     -- stored, and passed to ``Urltest`` and ``collector``
        """
        threading.Thread.__init__(self, name=Name)

        # Values taken straight from the arguments.
        self.runtime_queue = runtime_queue
        self.list = list
        self.Flcok = Flcok
        self.home_urls = home_urls
        self.maxnum = per_max_num
        self.num = 0

        # Path management for this site.
        site_paths = path(site_id)
        self.path = site_paths

        # Helper objects.
        self.urltest = Urltest(home_urls)
        self.htmlparser = Collector()
        self.collector = collector(home_urls)
        self.inqueue = Queue()

        # Clean up the original directories before starting:
        # create the site directory,
        site_dir = site_paths.g_site()
        site_paths.mk_dir(site_dir)
        # remove the urltest file,
        urltest_file = site_paths.g_urltest()
        site_paths.rm_file(urltest_file)
        # and empty the document directory.
        document_dir = site_paths.g_document()
        site_paths.clean_dir(document_dir)
コード例 #3
0
ファイル: Reptile.py プロジェクト: Rossonero/qlin
    def __init__(self, site_id, Name, runtime_queue, list, per_max_num, Flcok,
                 home_urls):
        """Initialise the thread and reset the site's working directories.

        site_id       -- passed to ``path()`` to obtain the directory manager
        Name          -- thread name handed to ``threading.Thread``
        runtime_queue -- stored as ``self.runtime_queue``
        list          -- stored as ``self.list``
        per_max_num   -- stored as ``self.maxnum``
        Flcok         -- stored as ``self.Flcok`` (presumably a shared lock;
                         confirm against the caller)
        home_urls     -- stored, and passed to ``Urltest`` and ``collector``
        """
        threading.Thread.__init__(self, name=Name)

        # Values taken straight from the arguments.
        self.runtime_queue = runtime_queue
        self.list = list
        self.Flcok = Flcok
        self.home_urls = home_urls
        self.maxnum = per_max_num
        self.num = 0

        # Path management for this site.
        site_paths = path(site_id)
        self.path = site_paths

        # Helper objects.
        self.urltest = Urltest(home_urls)
        self.htmlparser = Collector()
        self.collector = collector(home_urls)
        self.inqueue = Queue()

        # Clean up the original directories before starting:
        # create the site directory,
        site_dir = site_paths.g_site()
        site_paths.mk_dir(site_dir)
        # remove the urltest file,
        urltest_file = site_paths.g_urltest()
        site_paths.rm_file(urltest_file)
        # and empty the document directory.
        document_dir = site_paths.g_document()
        site_paths.clean_dir(document_dir)
コード例 #4
0
 def transDoc(self):
     """Convert every HTML file in ``self.htmlph`` into an XML document.

     For each directory entry the raw content is read, its encoding is
     guessed with ``chardet`` and, unless it is reported as ``'utf-8'``,
     the content is decoded.  The result is handed to ``collector`` and
     ``collector(...).xml(name).toxml()`` is written to a file with the
     same name under ``self.xmlph``.

     Decoding and serialisation failures are reported on stdout and do not
     stop the loop (best effort); the output file is still created.
     """
     for name in os.listdir(self.htmlph):
         print(name)
         # Read raw bytes: encoding detection must see the undecoded content.
         # The context manager closes the handle even if reading fails.
         with open(os.path.join(self.htmlph, name), 'rb') as src:
             content = src.read()

         # Detect the encoding and convert when it is not reported as utf-8.
         detected = chardet.detect(content)
         print(detected)
         coding = detected['encoding']
         if coding != 'utf-8':
             try:
                 content = content.decode(coding)
             except (UnicodeError, LookupError, TypeError):
                 # Undecodable bytes, unknown codec name, or no encoding
                 # detected (None): keep the raw content and carry on.
                 print('something wrong')

         parsed = collector(content)  # start parsing

         # Write the generated XML next to the other converted documents.
         with open(os.path.join(self.xmlph, name), 'w') as dst:
             try:
                 dst.write(parsed.xml(name).toxml())
             except Exception:
                 # Best effort: one bad document must not abort the batch,
                 # but Ctrl-C / SystemExit are no longer swallowed.
                 print('can not trans xml')
コード例 #5
0
ファイル: Single_Reptile.py プロジェクト: Rossonero/qlin
    def __init__(self, Name, runtime_queue, list, per_max_num, Flcok):
        """Initialise the thread with its queues, counters and helpers.

        Name          -- thread name handed to ``threading.Thread``
        runtime_queue -- stored as ``self.runtime_queue``
        list          -- stored as ``self.list``
        per_max_num   -- stored as ``self.maxnum``
        Flcok         -- stored as ``self.Flcok`` (presumably a shared lock;
                         confirm against the caller)
        """
        threading.Thread.__init__(self, name=Name)

        # Values taken straight from the arguments.
        self.runtime_queue = runtime_queue
        self.list = list
        self.Flcok = Flcok
        self.maxnum = per_max_num

        # Fresh per-instance state; home_urls starts out empty.
        self.num = 0
        self.home_urls = []

        # Helper objects.
        self.urltest = Urltest()
        self.htmlparser = Collector()
        self.collector = collector()
        self.inqueue = Queue()
コード例 #6
0
ファイル: Single_Reptile.py プロジェクト: Superjomn/qlin
    def __init__(self, Name, runtime_queue, list, per_max_num, Flcok):
        """Initialise the thread with its queues, counters and helpers.

        Name          -- thread name handed to ``threading.Thread``
        runtime_queue -- stored as ``self.runtime_queue``
        list          -- stored as ``self.list``
        per_max_num   -- stored as ``self.maxnum``
        Flcok         -- stored as ``self.Flcok`` (presumably a shared lock;
                         confirm against the caller)
        """
        threading.Thread.__init__(self, name=Name)

        # Values taken straight from the arguments.
        self.runtime_queue = runtime_queue
        self.list = list
        self.Flcok = Flcok
        self.maxnum = per_max_num

        # Fresh per-instance state; home_urls starts out empty.
        self.num = 0
        self.home_urls = []

        # Helper objects.
        self.urltest = Urltest()
        self.htmlparser = Collector()
        self.collector = collector()
        self.inqueue = Queue()