Example #1
0
    def __init__(self, pageph, hitph):
        """Build the query engine and load every table it relies on.

        Args:
            pageph: stored as ``self.pageph``; presumably the pagerank
                file consumed by ``initPageranker`` -- confirm.
            hitph: stored as ``self.hitph``; presumably the hits file
                consumed by ``inithits`` -- confirm.
        """
        # Plain state first: the two input paths and the containers that
        # the loader methods below are expected to fill.
        self.pageph, self.hitph = pageph, hitph
        self.pageranker = []
        self.hits = []

        # Word segmenter.
        self.ict = Ictclas('ICTCLAS50/')
        # Score statistics list.
        self.hitdoclist = Hitlist()
        # Lexicon, used to obtain a wordID.
        self.wordbar = wordbar('../store/wordbar')

        # hithash: build the helper, initialise it, then keep it.
        hasher = InitHashWid('../store/sortedwidhits',
                             '../store/hithash')
        hasher.initHashWid()
        self.hithasher = hasher

        # Rank totals: the sum of the scores of a single doc.
        totals = InitRankTotal('../store/sorteddochits',
                               '../store/tranks')
        totals.initTotalRank()
        self.ranktotal = totals

        # Load pagerank values and hits.
        self.initPageranker()
        self.inithits()

        self.hithash = self.hithasher.hithash
        # Number of loaded hits.
        self.length = len(self.hits)
        # Sorting helper.
        self.sorter = sorter()
Example #2
0
    def __init__(self, pageph, hitph):
        """Set up the segmenter, lexicon, hit tables and sorter.

        Args:
            pageph: kept as ``self.pageph`` (presumably the path read by
                ``initPageranker`` -- confirm against that method).
            hitph: kept as ``self.hitph`` (presumably the path read by
                ``inithits`` -- confirm against that method).
        """
        # Input paths.
        self.hitph = hitph
        self.pageph = pageph

        # Containers populated by inithits() / initPageranker() below.
        self.hits = []
        self.pageranker = []

        self.ict = Ictclas('ICTCLAS50/')
        self.hitdoclist = Hitlist()  # score statistics list
        self.wordbar = wordbar('../store/wordbar')  # lexicon, gives the wordID

        # hithash-related helper; initHashWid() initialises the hithash.
        self.hithasher = InitHashWid(
            '../store/sortedwidhits',
            '../store/hithash',
        )
        self.hithasher.initHashWid()

        # Rank totals: sum of the scores of a single doc.
        self.ranktotal = InitRankTotal(
            '../store/sorteddochits',
            '../store/tranks',
        )
        self.ranktotal.initTotalRank()

        # Initialise pagerank, then the hits.
        self.initPageranker()
        self.inithits()

        self.hithash = self.hithasher.hithash
        self.length = len(self.hits)  # length of hits

        # Sorting.
        self.sorter = sorter()
Example #3
0
class Query:
    '查询库'
    def __init__(self,pageph,hitph):
        'init'
        self.ict=Ictclas('ICTCLAS50/') 
        self.hitph=hitph
        self.pageph=pageph
        self.hitdoclist=Hitlist() #得分统计列表
        self.wordbar=wordbar('../store/wordbar') #词库 以便得到wordID
        #hithash相关
        self.hithasher=InitHashWid('../store/sortedwidhits','../store/hithash')
        self.hithasher.initHashWid()#初始化hithash
        #init rank total 单个doc的score总和
        self.ranktotal=InitRankTotal('../store/sorteddochits','../store/tranks')
        self.ranktotal.initTotalRank()

        self.hits=[]
        #初始化pagerank
        self.pageranker=[]
        self.initPageranker()
        self.inithits()#初始化hits
        self.hithash=self.hithasher.hithash
        self.length=len(self.hits) #hits长度
        #print 'length of hits is',self.length
        #排序
        self.sorter=sorter()

    def initPageranker(self):
        print 'init pageranker'
        f=open(self.pageph)
        lines=f.readlines()
        f.close()
        for l in lines:
            self.pageranker.append(float(l))
        

    def inithits(self):
        f=open(self.hitph)
        lines=f.readlines()
        f.close()
        for l in lines:
            self.hits.append(l.split())

    def query(self,strr):
        '单个查询'
        words=self.wordsplit(strr) #分词后的查询结果
        #print '分词结果为',words
        for word in words.split():
            #对每个word进行处理
            print '--start to query word--',word
            wordid=self.wordbar.find(word) #需要查询的wordID
            print '查得的wordID为',wordid
            if wordid:

                hithashpos=self.hithasher.find([wordid,0]) #hithasher返回的为目标数据在hithash中的位置

                if hithashpos:

                    starthitpos=int(self.hithash[hithashpos][1])
                    print '查得的hitpos为',starthitpos
                    #得到wordID在hits表中的片段地址 starthitpos  endhitpos
                    print '开始地址',starthitpos

                    if starthitpos+1<self.length:
                        endhitpos=int(self.hithash[hithashpos+1][1])-1
                    else:
                        endhitpos=starthitpos
                else:
                    continue
            else:
                continue

            #开始扫描片段 进行加分计算
            index=starthitpos
            print '结束地址',endhitpos

            while index<=endhitpos:
                #开始加分处理
                self.hitdoclist.find(self.hits[index])
                index+=1

        #对结尾进行还原
        print '对结尾进行还原'
        self.hitdoclist.InitStatus()

        print 'the former doclist---------------------------'
        for i in self.hitdoclist:
            print i

        #将score转化为相对score
        #print '开始转为相对score'

        for i,score in enumerate(self.hitdoclist):

            #调整精度
            getcontext().prec = 6
            docid=score[0]
            rankpos=self.ranktotal.find([docid,0])#返回记录的位置
            perrank=0 #对于每个记录最终的page值综合

            for j,total in enumerate(self.ranktotal.tranks[rankpos]):
                #开始对每个总值进行扫描 将最终结果保存到 self.hitdoclist[i][-1]中

                if j>0:

                    ranktotal=self.ranktotal.tranks[rankpos][j]
                    
                    if int(ranktotal)==0:
                        self.hitdoclist[i][j]=0
                    else:
                        self.hitdoclist[i][j]=float(score[j])/float(ranktotal)

            #开始将每个标签的rank添加到总rank中
            #title:9 h1:3 h2:2 h3:2 b:1 a:0.5 content:0.5
            #开始加入pageranker
            print 'now calculate the pageranker with the result'
            print 'the docid is',self.hitdoclist[i][0]

            #self.hitdoclist[i][-1]= self.pageranker[ int(self.hitdoclist[i][0])]*(  self.hitdoclist[i][1]*0.5 + self.hitdoclist[i][2]*0.056+ self.hitdoclist[i][3]*0.167 +self.hitdoclist[i][4]*0.11 + self.hitdoclist[i][5]*0.11  +self.hitdoclist[i][6]*0.027 +self.hitdoclist[i][7]*0.027 )
            self.hitdoclist[i][-1]=self.hitdoclist[i][1]
            for k,summ in enumerate(self.hitdoclist[i]):
                if k>0:
                    self.hitdoclist[i][-1]+=summ

        print 'start to print the former hitdoclist'
        for i in self.hitdoclist:
            print i
        self.sorter.run(self.hitdoclist)
        print 'the result'
        self.sorter.showlist()
        #return self.getResList() #返回结果字符串 给服务器

    def wordsplit(self,sentence):
        '将查询语句分词'
        return self.ict.split(sentence)

    def getResList(self):
        strr=''
        for i in self.hitdoclist:
            strr+=str(i[0])+' '
        return strr
Example #4
0
class Query:
    '查询库'

    def __init__(self, pageph, hitph):
        'init'
        self.ict = Ictclas('ICTCLAS50/')
        self.hitph = hitph
        self.pageph = pageph
        self.hitdoclist = Hitlist()  #得分统计列表
        self.wordbar = wordbar('../store/wordbar')  #词库 以便得到wordID
        #hithash相关
        self.hithasher = InitHashWid('../store/sortedwidhits',
                                     '../store/hithash')
        self.hithasher.initHashWid()  #初始化hithash
        #init rank total 单个doc的score总和
        self.ranktotal = InitRankTotal('../store/sorteddochits',
                                       '../store/tranks')
        self.ranktotal.initTotalRank()

        self.hits = []
        #初始化pagerank
        self.pageranker = []
        self.initPageranker()
        self.inithits()  #初始化hits
        self.hithash = self.hithasher.hithash
        self.length = len(self.hits)  #hits长度
        #print 'length of hits is',self.length
        #排序
        self.sorter = sorter()

    def initPageranker(self):
        print 'init pageranker'
        f = open(self.pageph)
        lines = f.readlines()
        f.close()
        for l in lines:
            self.pageranker.append(float(l))

    def inithits(self):
        f = open(self.hitph)
        lines = f.readlines()
        f.close()
        for l in lines:
            self.hits.append(l.split())

    def query(self, strr):
        '单个查询'
        words = self.wordsplit(strr)  #分词后的查询结果
        #print '分词结果为',words
        for word in words.split():
            #对每个word进行处理
            print '--start to query word--', word
            wordid = self.wordbar.find(word)  #需要查询的wordID
            print '查得的wordID为', wordid
            if wordid:

                hithashpos = self.hithasher.find(
                    [wordid, 0])  #hithasher返回的为目标数据在hithash中的位置

                if hithashpos:

                    starthitpos = int(self.hithash[hithashpos][1])
                    print '查得的hitpos为', starthitpos
                    #得到wordID在hits表中的片段地址 starthitpos  endhitpos
                    print '开始地址', starthitpos

                    if starthitpos + 1 < self.length:
                        endhitpos = int(self.hithash[hithashpos + 1][1]) - 1
                    else:
                        endhitpos = starthitpos
                else:
                    continue
            else:
                continue

            #开始扫描片段 进行加分计算
            index = starthitpos
            print '结束地址', endhitpos

            while index <= endhitpos:
                #开始加分处理
                self.hitdoclist.find(self.hits[index])
                index += 1

        #对结尾进行还原
        print '对结尾进行还原'
        self.hitdoclist.InitStatus()

        print 'the former doclist---------------------------'
        for i in self.hitdoclist:
            print i

        #将score转化为相对score
        #print '开始转为相对score'

        for i, score in enumerate(self.hitdoclist):

            #调整精度
            getcontext().prec = 6
            docid = score[0]
            rankpos = self.ranktotal.find([docid, 0])  #返回记录的位置
            perrank = 0  #对于每个记录最终的page值综合

            for j, total in enumerate(self.ranktotal.tranks[rankpos]):
                #开始对每个总值进行扫描 将最终结果保存到 self.hitdoclist[i][-1]中

                if j > 0:

                    ranktotal = self.ranktotal.tranks[rankpos][j]

                    if int(ranktotal) == 0:
                        self.hitdoclist[i][j] = 0
                    else:
                        self.hitdoclist[i][j] = float(
                            score[j]) / float(ranktotal)

            #开始将每个标签的rank添加到总rank中
            #title:9 h1:3 h2:2 h3:2 b:1 a:0.5 content:0.5
            #开始加入pageranker
            print 'now calculate the pageranker with the result'
            print 'the docid is', self.hitdoclist[i][0]

            #self.hitdoclist[i][-1]= self.pageranker[ int(self.hitdoclist[i][0])]*(  self.hitdoclist[i][1]*0.5 + self.hitdoclist[i][2]*0.056+ self.hitdoclist[i][3]*0.167 +self.hitdoclist[i][4]*0.11 + self.hitdoclist[i][5]*0.11  +self.hitdoclist[i][6]*0.027 +self.hitdoclist[i][7]*0.027 )
            self.hitdoclist[i][-1] = self.hitdoclist[i][1]
            for k, summ in enumerate(self.hitdoclist[i]):
                if k > 0:
                    self.hitdoclist[i][-1] += summ

        print 'start to print the former hitdoclist'
        for i in self.hitdoclist:
            print i
        self.sorter.run(self.hitdoclist)
        print 'the result'
        self.sorter.showlist()
        #return self.getResList() #返回结果字符串 给服务器

    def wordsplit(self, sentence):
        '将查询语句分词'
        return self.ict.split(sentence)

    def getResList(self):
        strr = ''
        for i in self.hitdoclist:
            strr += str(i[0]) + ' '
        return strr