class parser: def __init__(self,htmlph,xmlph,wsplitph,wbpath): reload(sys) sys.setdefaultencoding('utf-8') self.ict=Ictclas('ICTCLAS50/') self.wordbar=wordlist()#wordBar self.spword='@chunwei@' #区分内容的关键字 #设定相应路径 self.htmlph=htmlph self.xmlph=xmlph self.wsplitph=wsplitph self.wbpath=wbpath def transDoc(self): '将html源码转化为document文件' htmlli=os.listdir(self.htmlph)#取得html路径 num=0 for hp in htmlli: print hp f=open(self.htmlph+'/'+hp) c=f.read() #自动判别编码 并进行转化 res=chardet.detect(c) print res coding=res['encoding'] #print 'the former coding',coding if coding!='utf-8': try: c=c.decode(coding) except: print 'something wrong' collec=collector(c)#开始解析 f.close() f=open(self.xmlph+'/'+hp,'w') try: f.write(collec.xml(hp).toxml())#写入到新文件中 except: print 'can not trans xml' f.close() num+=1 def splitWord(self): '将document文件中的各项进行分词后 保存到新文件中' spword='@chunwei@' docli=os.listdir(self.xmlph+'/') num=0 for dp in docli: print dp #if num>1: # break #num+=1 f=open(self.xmlph+'/'+dp) c=f.read() if len(c)<200: continue #对空文件忽略 root=pq(c)#利用pyquery进行处理 f.close() #开始对各栏目进行处理 bb='' title=root('title').eq(0) bb+=self.ict.split( title.attr('text').encode('utf-8'))+' ' bb+=spword #b的处理 b=root('b item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8'))+' ' bb+=spword #h1 b=root('h1 item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8') )+' ' bb+=spword #h2 b=root('h2 item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8') )+' ' bb+=spword #h3 b=root('h3 item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8') ) +' ' bb+=spword #a b=root('a item') length=len(b) for i in range(length): self.ict.split( b.eq(i).attr('name').encode('utf-8') )+' ' bb+=spword #content content=root('content').eq(0) #print 'the content is ' #print content.text() bb+=self.ict.split( content.text().encode('utf-8'))+' ' #print 'the bb is' #print 
bb #save the result''' f=open(self.wsplitph+'/'+dp,'w+') f.write(bb) f.close() def __wordFind(self,strr): #print strr words=strr.split() flag=re.compile('\d') for i in words: if len(i)<=10: if i.find('=')>-1: continue if i.find('.')>-1: continue if flag.search(i): continue self.wordbar.find(i) def transWbar(self): '将已经分词的wordxml 分词为 wordBar 并且进行储存' li=os.listdir(self.wsplitph) for xml in li: print xml #开始解析分词 f=open(self.wsplitph+'/'+xml) c=f.read() f.close() #开始将文本整合 最后对str进行分词 for i in c.split(self.spword): self.__wordFind(i) #保存最后词库 strr='' for i in self.wordbar: #以字符串的形式保存 strr+=i+' ' f=open(self.wbpath,'w') f.write(strr) f.close() def _debug(self): f=open(self.wbpath) c=f.read() for i in c.split(): print i,hash(i)
class Query:
    'Query engine: looks up segmented query words and scores matching documents.'

    def __init__(self, pageph, hitph):
        'Load every index structure needed to answer queries.'
        self.ict = Ictclas('ICTCLAS50/')  # word segmenter for incoming queries
        self.hitph = hitph                # path of the hits table
        self.pageph = pageph              # path of the pagerank table
        self.hitdoclist = Hitlist()       # per-document score accumulator
        self.wordbar = wordbar('../store/wordbar')  # vocabulary: word -> wordID
        # hithash maps a wordID to its starting position in the hits table
        self.hithasher = InitHashWid('../store/sortedwidhits', '../store/hithash')
        self.hithasher.initHashWid()
        # per-document score totals, used to normalize individual scores
        self.ranktotal = InitRankTotal('../store/sorteddochits', '../store/tranks')
        self.ranktotal.initTotalRank()
        self.hits = []
        # pagerank values, one float per document
        self.pageranker = []
        self.initPageranker()
        self.inithits()
        self.hithash = self.hithasher.hithash
        self.length = len(self.hits)  # number of hit records
        #print 'length of hits is',self.length
        self.sorter = sorter()  # result sorter

    def initPageranker(self):
        'Read one pagerank float per line from the pagerank file.'
        print 'init pageranker'
        f = open(self.pageph)
        lines = f.readlines()
        f.close()
        for l in lines:
            self.pageranker.append(float(l))

    def inithits(self):
        'Load the hits table: one whitespace-separated record per line.'
        f = open(self.hitph)
        lines = f.readlines()
        f.close()
        for l in lines:
            self.hits.append(l.split())

    def query(self, strr):
        'Run a single query: segment, accumulate hits, normalize and sort scores.'
        words = self.wordsplit(strr)  # segmented query string
        #print '分词结果为',words
        for word in words.split():
            print '--start to query word--', word
            wordid = self.wordbar.find(word)  # wordID to look up
            print '查得的wordID为', wordid
            if wordid:
                # position of this wordID's entry inside hithash
                hithashpos = self.hithasher.find([wordid, 0])
                if hithashpos:
                    starthitpos = int(self.hithash[hithashpos][1])
                    print '查得的hitpos为', starthitpos
                    # the hits slice for this wordID spans
                    # [starthitpos, endhitpos]
                    print '开始地址', starthitpos
                    # NOTE(review): the bound compares starthitpos against
                    # len(self.hits) but then indexes self.hithash — this
                    # looks like it should compare hithashpos+1 with
                    # len(self.hithash); confirm before changing.
                    if starthitpos + 1 < self.length:
                        endhitpos = int(self.hithash[hithashpos + 1][1]) - 1
                    else:
                        endhitpos = starthitpos
                else:
                    continue
            else:
                continue
            # scan the slice, adding each hit record to the accumulator
            index = starthitpos
            print '结束地址', endhitpos
            while index <= endhitpos:
                self.hitdoclist.find(self.hits[index])
                index += 1
        # finalize the accumulator state
        print '对结尾进行还原'
        self.hitdoclist.InitStatus()
        print 'the former doclist---------------------------'
        for i in self.hitdoclist:
            print i
        # convert absolute scores into scores relative to each document total
        for i, score in enumerate(self.hitdoclist):
            getcontext().prec = 6  # Decimal precision for subsequent arithmetic
            docid = score[0]
            rankpos = self.ranktotal.find([docid, 0])  # row holding this doc's totals
            perrank = 0
            for j, total in enumerate(self.ranktotal.tranks[rankpos]):
                # column 0 is the docid itself; normalize the rest in place
                if j > 0:
                    ranktotal = self.ranktotal.tranks[rankpos][j]
                    if int(ranktotal) == 0:
                        self.hitdoclist[i][j] = 0
                    else:
                        self.hitdoclist[i][j] = float(score[j]) / float(ranktotal)
            # per-tag weights once used: title:9 h1:3 h2:2 h3:2 b:1 a:0.5 content:0.5
            print 'now calculate the pageranker with the result'
            print 'the docid is', self.hitdoclist[i][0]
            #self.hitdoclist[i][-1]= self.pageranker[int(self.hitdoclist[i][0])]*(weighted tag sum)  -- disabled formula
            # NOTE(review): the fold below seeds with element 1 and then adds
            # every element with k>0 (element 1 again, plus the partially
            # updated last slot); confirm whether that double counting is
            # intended before touching it.
            self.hitdoclist[i][-1] = self.hitdoclist[i][1]
            for k, summ in enumerate(self.hitdoclist[i]):
                if k > 0:
                    self.hitdoclist[i][-1] += summ
        print 'start to print the former hitdoclist'
        for i in self.hitdoclist:
            print i
        self.sorter.run(self.hitdoclist)
        print 'the result'
        self.sorter.showlist()
        #return self.getResList()  # result string for the server

    def wordsplit(self, sentence):
        'Segment the raw query sentence with ICTCLAS.'
        return self.ict.split(sentence)

    def getResList(self):
        'Return the matched docIDs as one space-separated string.'
        strr = ''
        for i in self.hitdoclist:
            strr += str(i[0]) + ' '
        return strr
class Parser: ''' 解析库 将下载后的html源码同时转化为document 返回 部分功能嵌入到spider中 ''' def __init__(self,site_id): ''' 初始化各项目录 ''' self.path = path(site_id) self.ict=Ictclas('ICTCLAS50/') #self.wordbar=wordlist()#wordBar self.spword='@chunwei@' self.xmlph=self.path.g_document() self.wsplitph=self.path.g_wordsplit() self.wbpath=self.path.g_wordbar() #初始化词库 self.wordbar = Thes.Create_Thesaurus(self.wbpath) #数据库相关 self.cx = sq.connect(self.path.g_chun_sqlite()) self.cu = self.cx.cursor() def splitWord(self): ''' 转化为 wordsplit形式 格式为 <dom str> @chunwei@ <dim str> 直接使用了字符串进行分割 ''' spword='@chunwei@' docli=os.listdir(self.xmlph+'/') num=0 for dp in docli: #print dp f=open(self.xmlph+'/'+dp) c=f.read() if len(c)<200: continue root=pq(c) f.close() bb='' title=root('title').eq(0) bb+=self.ict.split( title.attr('text').encode('utf-8'))+' ' bb+=spword b=root('b item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8'))+' ' bb+=spword #h1 b=root('h1 item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8') )+' ' bb+=spword #h2 b=root('h2 item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8') )+' ' bb+=spword #h3 b=root('h3 item') length=len(b) for i in range(length): bb+=self.ict.split( b.eq(i).attr('text').encode('utf-8') ) +' ' bb+=spword #a b=root('a item') length=len(b) for i in range(length): self.ict.split( b.eq(i).attr('name').encode('utf-8') )+' ' bb+=spword #content content=root('content').eq(0) #print 'the content is ' #print content.text() bb+=self.ict.split( content.text().encode('utf-8'))+' ' #print 'the bb is' #print bb #save the result''' f=open(self.wsplitph+'/'+dp,'w+') f.write(bb) f.close() def __wordFind(self,strr): #print strr words=strr.split() flag=re.compile('\d') for i in words: if len(i)<=10: if i.find('=')>-1: continue if i.find('.')>-1: continue if flag.search(i): continue self.wordbar.find(i) def transWbar(self): '词库初始化' 
li=os.listdir(self.wsplitph) for xml in li: f=open(self.wsplitph+'/'+xml) c=f.read() f.close() for i in c.split(self.spword): self.__wordFind(i) #print 'begin to find des' '''for i in self.get_split_des_words(xml): self.__wordFind(i) ''' strr='' #for i in self.wordbar.li: #strr+=i+' ' f=open(self.wbpath,'w') f.write(self.wordbar.get_words()) f.close() print 'begin to create hash' self.wordbar.create_hash(self.path.g_hash_index()) self.wordbar.save_wide(self.path.g_word_wide()) def get_split_des_words(self,docID): ''' 添加 des 的 hash ''' self.cu.execute("select des from lib where docID = %d"%int(docID)) li= self.cu.fetchone() if li[0]: return self.ict.split( str(li[0]) ).split() else: return [''] def _debug(self): f=open(self.wbpath) c=f.read() for i in c.split(): print i,hash(i)
# -*- coding: utf-8 -*- import sys reload(sys) sys.setdefaultencoding('utf-8') from ICTCLAS50.Ictclas import Ictclas import chardet as cdt ict = Ictclas('ICTCLAS50/') words = ict.split("中国农业大学") print words for w in words: print w print cdt.detect(w)
class Parser: ''' 解析库 将下载后的html源码同时转化为document 返回 部分功能嵌入到spider中 ''' def __init__(self, site_id): ''' 初始化各项目录 ''' self.path = path(site_id) self.ict = Ictclas('ICTCLAS50/') #self.wordbar=wordlist()#wordBar self.spword = '@chunwei@' self.xmlph = self.path.g_document() self.wsplitph = self.path.g_wordsplit() self.wbpath = self.path.g_wordbar() #初始化词库 self.wordbar = Thes.Create_Thesaurus(self.wbpath) #数据库相关 self.cx = sq.connect(self.path.g_chun_sqlite()) self.cu = self.cx.cursor() def splitWord(self): ''' 转化为 wordsplit形式 格式为 <dom str> @chunwei@ <dim str> 直接使用了字符串进行分割 ''' spword = '@chunwei@' docli = os.listdir(self.xmlph + '/') num = 0 for dp in docli: #print dp f = open(self.xmlph + '/' + dp) c = f.read() if len(c) < 200: continue root = pq(c) f.close() bb = '' title = root('title').eq(0) bb += self.ict.split(title.attr('text').encode('utf-8')) + ' ' bb += spword b = root('b item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #h1 b = root('h1 item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #h2 b = root('h2 item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #h3 b = root('h3 item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #a b = root('a item') length = len(b) for i in range(length): self.ict.split(b.eq(i).attr('name').encode('utf-8')) + ' ' bb += spword #content content = root('content').eq(0) #print 'the content is ' #print content.text() bb += self.ict.split(content.text().encode('utf-8')) + ' ' #print 'the bb is' #print bb #save the result''' f = open(self.wsplitph + '/' + dp, 'w+') f.write(bb) f.close() def __wordFind(self, strr): #print strr words = strr.split() flag = re.compile('\d') for i in words: if len(i) <= 10: if i.find('=') > -1: continue if i.find('.') > 
-1: continue if flag.search(i): continue self.wordbar.find(i) def transWbar(self): '词库初始化' li = os.listdir(self.wsplitph) for xml in li: f = open(self.wsplitph + '/' + xml) c = f.read() f.close() for i in c.split(self.spword): self.__wordFind(i) #print 'begin to find des' '''for i in self.get_split_des_words(xml): self.__wordFind(i) ''' strr = '' #for i in self.wordbar.li: #strr+=i+' ' f = open(self.wbpath, 'w') f.write(self.wordbar.get_words()) f.close() print 'begin to create hash' self.wordbar.create_hash(self.path.g_hash_index()) self.wordbar.save_wide(self.path.g_word_wide()) def get_split_des_words(self, docID): ''' 添加 des 的 hash ''' self.cu.execute("select des from lib where docID = %d" % int(docID)) li = self.cu.fetchone() if li[0]: return self.ict.split(str(li[0])).split() else: return [''] def _debug(self): f = open(self.wbpath) c = f.read() for i in c.split(): print i, hash(i)
# -*- coding: utf-8 -*- import sys reload(sys) sys.setdefaultencoding('utf-8') from ICTCLAS50.Ictclas import Ictclas import chardet as cdt ict=Ictclas('ICTCLAS50/') words = ict.split("中国农业大学") print words for w in words: print w print cdt.detect(w)
class parser: def __init__(self, htmlph, xmlph, wsplitph, wbpath): reload(sys) sys.setdefaultencoding('utf-8') self.ict = Ictclas('ICTCLAS50/') self.wordbar = wordlist() #wordBar self.spword = '@chunwei@' #区分内容的关键字 #设定相应路径 self.htmlph = htmlph self.xmlph = xmlph self.wsplitph = wsplitph self.wbpath = wbpath def transDoc(self): '将html源码转化为document文件' htmlli = os.listdir(self.htmlph) #取得html路径 num = 0 for hp in htmlli: print hp f = open(self.htmlph + '/' + hp) c = f.read() #自动判别编码 并进行转化 res = chardet.detect(c) print res coding = res['encoding'] #print 'the former coding',coding if coding != 'utf-8': try: c = c.decode(coding) except: print 'something wrong' collec = collector(c) #开始解析 f.close() f = open(self.xmlph + '/' + hp, 'w') try: f.write(collec.xml(hp).toxml()) #写入到新文件中 except: print 'can not trans xml' f.close() num += 1 def splitWord(self): '将document文件中的各项进行分词后 保存到新文件中' spword = '@chunwei@' docli = os.listdir(self.xmlph + '/') num = 0 for dp in docli: print dp #if num>1: # break #num+=1 f = open(self.xmlph + '/' + dp) c = f.read() if len(c) < 200: continue #对空文件忽略 root = pq(c) #利用pyquery进行处理 f.close() #开始对各栏目进行处理 bb = '' title = root('title').eq(0) bb += self.ict.split(title.attr('text').encode('utf-8')) + ' ' bb += spword #b的处理 b = root('b item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #h1 b = root('h1 item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #h2 b = root('h2 item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #h3 b = root('h3 item') length = len(b) for i in range(length): bb += self.ict.split( b.eq(i).attr('text').encode('utf-8')) + ' ' bb += spword #a b = root('a item') length = len(b) for i in range(length): self.ict.split(b.eq(i).attr('name').encode('utf-8')) + ' ' bb += spword #content content = root('content').eq(0) 
#print 'the content is ' #print content.text() bb += self.ict.split(content.text().encode('utf-8')) + ' ' #print 'the bb is' #print bb #save the result''' f = open(self.wsplitph + '/' + dp, 'w+') f.write(bb) f.close() def __wordFind(self, strr): #print strr words = strr.split() flag = re.compile('\d') for i in words: if len(i) <= 10: if i.find('=') > -1: continue if i.find('.') > -1: continue if flag.search(i): continue self.wordbar.find(i) def transWbar(self): '将已经分词的wordxml 分词为 wordBar 并且进行储存' li = os.listdir(self.wsplitph) for xml in li: print xml #开始解析分词 f = open(self.wsplitph + '/' + xml) c = f.read() f.close() #开始将文本整合 最后对str进行分词 for i in c.split(self.spword): self.__wordFind(i) #保存最后词库 strr = '' for i in self.wordbar: #以字符串的形式保存 strr += i + ' ' f = open(self.wbpath, 'w') f.write(strr) f.close() def _debug(self): f = open(self.wbpath) c = f.read() for i in c.split(): print i, hash(i)
class Query:
    'Query engine: looks up segmented query words and scores matching documents.'

    def __init__(self, pageph, hitph):
        'Load every index structure needed to answer queries.'
        self.ict = Ictclas('ICTCLAS50/')  # word segmenter for incoming queries
        self.hitph = hitph                # path of the hits table
        self.pageph = pageph              # path of the pagerank table
        self.hitdoclist = Hitlist()       # per-document score accumulator
        self.wordbar = wordbar('../store/wordbar')  # vocabulary: word -> wordID
        # hithash maps a wordID to its starting position in the hits table
        self.hithasher = InitHashWid('../store/sortedwidhits', '../store/hithash')
        self.hithasher.initHashWid()
        # per-document score totals, used to normalize individual scores
        self.ranktotal = InitRankTotal('../store/sorteddochits', '../store/tranks')
        self.ranktotal.initTotalRank()
        self.hits = []
        # pagerank values, one float per document
        self.pageranker = []
        self.initPageranker()
        self.inithits()
        self.hithash = self.hithasher.hithash
        self.length = len(self.hits)  # number of hit records
        #print 'length of hits is',self.length
        self.sorter = sorter()  # result sorter

    def initPageranker(self):
        'Read one pagerank float per line from the pagerank file.'
        print 'init pageranker'
        f = open(self.pageph)
        lines = f.readlines()
        f.close()
        for l in lines:
            self.pageranker.append(float(l))

    def inithits(self):
        'Load the hits table: one whitespace-separated record per line.'
        f = open(self.hitph)
        lines = f.readlines()
        f.close()
        for l in lines:
            self.hits.append(l.split())

    def query(self, strr):
        'Run a single query: segment, accumulate hits, normalize and sort scores.'
        words = self.wordsplit(strr)  # segmented query string
        #print '分词结果为',words
        for word in words.split():
            print '--start to query word--', word
            wordid = self.wordbar.find(word)  # wordID to look up
            print '查得的wordID为', wordid
            if wordid:
                # position of this wordID's entry inside hithash
                hithashpos = self.hithasher.find([wordid, 0])
                if hithashpos:
                    starthitpos = int(self.hithash[hithashpos][1])
                    print '查得的hitpos为', starthitpos
                    # the hits slice for this wordID spans
                    # [starthitpos, endhitpos]
                    print '开始地址', starthitpos
                    # NOTE(review): the bound compares starthitpos against
                    # len(self.hits) but then indexes self.hithash — this
                    # looks like it should compare hithashpos+1 with
                    # len(self.hithash); confirm before changing.
                    if starthitpos + 1 < self.length:
                        endhitpos = int(self.hithash[hithashpos + 1][1]) - 1
                    else:
                        endhitpos = starthitpos
                else:
                    continue
            else:
                continue
            # scan the slice, adding each hit record to the accumulator
            index = starthitpos
            print '结束地址', endhitpos
            while index <= endhitpos:
                self.hitdoclist.find(self.hits[index])
                index += 1
        # finalize the accumulator state
        print '对结尾进行还原'
        self.hitdoclist.InitStatus()
        print 'the former doclist---------------------------'
        for i in self.hitdoclist:
            print i
        # convert absolute scores into scores relative to each document total
        for i, score in enumerate(self.hitdoclist):
            getcontext().prec = 6  # Decimal precision for subsequent arithmetic
            docid = score[0]
            rankpos = self.ranktotal.find([docid, 0])  # row holding this doc's totals
            perrank = 0
            for j, total in enumerate(self.ranktotal.tranks[rankpos]):
                # column 0 is the docid itself; normalize the rest in place
                if j > 0:
                    ranktotal = self.ranktotal.tranks[rankpos][j]
                    if int(ranktotal) == 0:
                        self.hitdoclist[i][j] = 0
                    else:
                        self.hitdoclist[i][j] = float(score[j]) / float(ranktotal)
            # per-tag weights once used: title:9 h1:3 h2:2 h3:2 b:1 a:0.5 content:0.5
            print 'now calculate the pageranker with the result'
            print 'the docid is', self.hitdoclist[i][0]
            #self.hitdoclist[i][-1]= self.pageranker[int(self.hitdoclist[i][0])]*(weighted tag sum)  -- disabled formula
            # NOTE(review): the fold below seeds with element 1 and then adds
            # every element with k>0 (element 1 again, plus the partially
            # updated last slot); confirm whether that double counting is
            # intended before touching it.
            self.hitdoclist[i][-1] = self.hitdoclist[i][1]
            for k, summ in enumerate(self.hitdoclist[i]):
                if k > 0:
                    self.hitdoclist[i][-1] += summ
        print 'start to print the former hitdoclist'
        for i in self.hitdoclist:
            print i
        self.sorter.run(self.hitdoclist)
        print 'the result'
        self.sorter.showlist()
        #return self.getResList()  # result string for the server

    def wordsplit(self, sentence):
        'Segment the raw query sentence with ICTCLAS.'
        return self.ict.split(sentence)

    def getResList(self):
        'Return the matched docIDs as one space-separated string.'
        strr = ''
        for i in self.hitdoclist:
            strr += str(i[0]) + ' '
        return strr