Esempio n. 1
0
    def __init__(self):
        self.uni=''
	self.pct=''
	self.page=0
	self.pagesize=10
	self.worddic={}
	self.wordlist=[]
	self.dq=DicQuery()
Esempio n. 2
0
class QueryObj(object):

    def __init__(self):
        self.uni=''
	self.pct=''
	self.page=0
	self.pagesize=10
	self.worddic={}
	self.wordlist=[]
	self.dq=DicQuery()

    def udngram(self):
	tail=[]
	getre=bngram.wordspliting(self.uni,"query")
	getre.start()
	getre.join()
	tail.append(getre)
	self.worddic, self.wordlist = self.dq.tailobject(tail)

    def dbinit(self):
	dbtype = db.DB_HASH
	dbopenflags = db.DB_THREAD
	envflags = db.DB_THREAD | db.DB_INIT_MPOOL | db.DB_INIT_LOCK
	dbenv = db.DBEnv()
	dbenv.set_mp_mmapsize(50*1024*1024)
	dbenv.set_cachesize(0,100*1024*1024,0)
	homeDir='database/world'
	dbenv.open(homeDir, envflags | db.DB_CREATE)
	self.tempdb=db.DB(dbenv)
	self.tempdb.open('tempwdb.db', mode=0660, dbtype=db.DB_HASH, flags=dbopenflags|db.DB_CREATE)

        wdbenv = db.DBEnv()
        envflags= db.DB_THREAD | db.DB_INIT_MPOOL | db.DB_INIT_LOCK
        wdbenv.set_mp_mmapsize(100*1024*1024)
        wdbenv.set_cachesize(0,200*1024*1024,0)
        homeDir='database/wpage/'
        wdbenv.open(homeDir, envflags|db.DB_CREATE)
        self.wpagedb=db.DB(wdbenv)
        self.wpagedb.open('wpagedb.db', None, db.DB_HASH, db.DB_DIRTY_READ)

	self.purei=Purecontent('r')
	self.serialdb=db.DB()
	self.serialdb.open('database/pureserial.db', None, db.DB_BTREE, db.DB_DIRTY_READ)
	self.urldb=db.DB()
	self.urldb.open('dic/anurl.db', None, db.DB_HASH, db.DB_DIRTY_READ)

    def dbclose(self):
	self.dq.dicclose()
	self.purei.close()
	self.urldb.close()
	self.tempdb.close()
	self.wpagedb.close()

    def wordmarkup(self,scorelist,startat):
        tempscorelist=[]
	at=scorelist[0][0]-startat
	for x in scorelist:
		start,end=x
		start=at+start-scorelist[0][0]
		end=at+end-scorelist[0][0]
		tempscorelist.append((start,end))
	return tempscorelist

    def uniques(self,wlist):
        u=[]
        for l in wlist:
                if l not in u and len(l)>0:
                        u.append(l)

        return u

    def dpickdump(self,picklist):
    	dpickstring=""
	for x in picklist:
	    if len(str(x[0]))<9:
	    	dpickstring=dpickstring+str(len(str(x[0])))+str(x[0])
	return dpickstring

    def dpickload(self,pick,wordsize):
        i=0
	b=[]
	while i<len(pick):
	    start=int(pick[1+i:1+i+int(pick[i])])
	    end=start+wordsize
	    b.append((start,end))
	    i=i+1+int(pick[i])
	return b

    def Stringload(self,UrlString):
        URLList={}
        for x in xrange(0,(len(UrlString)//6)):
	    URLList[UrlString[x*6:(1+x)*6]]=''
        return len(URLList),URLList

    def findpunctuation(self, pct, startat):
	until=0
	if startat > 20:
		until=20
	elif startat > 1:
		until=startat
	else:
		pass
	if until:
            for punctuationx in xrange(1,until):
	        if punctuationmarks(ord(pct[startat-punctuationx])):
	            startat=startat-punctuationx+1
	            break
	return startat

    def query(self):
	ST=time.time()
	conlinklist={}
	concountlist={}
	self.wordlist=self.uniques(self.wordlist)
	totalword=len(self.wordlist)
	for dbname in self.wordlist:
	    print self.worddic[dbname],
	    if self.tempdb.has_key(dbname):
	        concountlist[dbname],conlinklist[dbname]=self.Stringload(zlib.decompress(self.tempdb[dbname]))
	print "total word:", totalword
	# find the smallest size of conlinklist.
	mixSL=(0,0)
	for listcount in concountlist.keys():
	    if concountlist[listcount]>mixSL[1] or mixSL==(0,0):
	    	mixSL=(listcount,concountlist[listcount])
	# marge all of the key list.
	basiclist=conlinklist[mixSL[0]]
	for mx in conlinklist.keys():
	   templist={}
	   if mx != mixSL[0]:
	   	for iclb in conlinklist[mx].keys():
		     if basiclist.has_key(iclb):
		     	templist[iclb]=''
		basiclist=templist
	# The match key of every conlinklist is basiclist
	querydic={}
	
	tbs=len(basiclist)
	print 'est size',tbs,time.time()-ST
	brack=int(tbs//10000)
	if brack>0:
	    partsize=tbs//brack
	    lastpartsize=tbs%brack
	    # self.page*self.pagesize
	    matst=(self.page%brack)*partsize
	    if matst==0:
	        rlist=basiclist.keys()[matst:partsize+lastpartsize]
	    else:
	    	rlist=basiclist.keys()[matst+lastpartsize:matst+partsize]

	    
	    pagestartat=int(self.page//brack)*self.pagesize
	    #if (pagestartat+self.pagesize)>totalsize:
	    #	pageendat=totalsize
	    #else:
	    pageendat=pagestartat+self.pagesize
	else:
		rlist=basiclist.keys()

	RTS=0.0
	PT=time.time()
	tplist=''
	tpindex={}
	for wi in self.wordlist:
	    tplist=zlib.decompress(self.wpagedb.get(wi))
	    idx=0
	    if tplist:
	        while idx<len(tplist):
		    querykey=tplist[idx:idx+6] # querykey
		    tw=ord(tplist[idx+6]) # wordcount
		    nextidx=idx+7+3*(tw+1)
		    wposition=tplist[idx+7:nextidx]
		    idx=nextidx
		    tpindex[wi+querykey]=wposition
		    #print len(wi+querykey),tw,len(wposition)
	print time.time()-ST	
	for x in rlist:
	    #ptl,title=self.purei.querycontent(x)
	    if not querydic.has_key(x[0:4]):
	    	querydic[x[0:4]]={}
	    posilist=[]
	    ase=0
	    PT=time.time()
	    for pw in self.wordlist:
	    	# total size will be 6 bytes querykey + 1 bytes wordcount + wordcount * 3 bytes position.
		position=tpindex[pw+x]
		totalpositionsize=len(position)//3
		prseek=0
		for pr in xrange(totalpositionsize):
		    pse=position[prseek:prseek+3]
		    start=(ord(pse[0])<<8)+ord(pse[1])
		    end=start+ord(pse[2])
		    posilist.append((start,end))
		    prseek+=3
	    RTS=RTS+(time.time()-PT)
	    posilist.sort()
	    #if x[4:6]==chr(0)+chr(0):
	    #	print posilist
	    querydic[x[0:4]][x[4:6]]=posilist
	    #print posilist
	print RTS
	print len(querydic),time.time()-ST
	scorelist={}
	for senlist in querydic.keys():
	    #print contentprocess.asciitoint(senlist),':'
	    for ralist in querydic[senlist].keys():
	        #print chr(32)*4,contentprocess.asciitolin(ralist),'->',
		rx=querydic[senlist][ralist][0]
		x1,y1=rx[0],rx[1]
		#print rx[0],rx[1],
		i,c=1,0 # i and c value are working for the wordlink
		score=0
		x2,y2=0,0
		a=[]
		for rx in querydic[senlist][ralist][1:]:
		    x2,y2=rx[0],rx[1]
		    if y1==x2 or (y1-1)==x2:
		    	x1,y1=(x1,y2)
			i,c=1,1
		    elif i==1 and c==0:
		    	i=0
			a.append((x1,y1))
			score=score+(y1-x1-2)
			x1,y1=(x2,y2)
			#pse=rx[2]
		    else:
			a.append((x1,y1))
			score=score+(y1-x1-2)
			x1,y1=(x2,y2)
			pse=''
		    c=0
		if i==1:
		    a.append((x1,y1))
		    score=score+(y1-x1-2)
		if a[-1][1]!=y2 and y2!=0:
		    a.append((x2,y2))
		if a:
		    querydic[senlist][ralist]=a
		scorelist[senlist+ralist]=score
	# choucing the bast part of dest
	print "Scorelist:",time.time()-ST
	bastlist=[]
	subjectdic={}
	for senlist in querydic.keys():
	    bscore=0
	    addscore=0
	    tl=''
	    for ralist in querydic[senlist].keys():
		if ralist == chr(0)+chr(0):
		    subjectdic[senlist]=querydic[senlist][ralist]
	        elif scorelist.has_key(senlist+chr(0)+chr(0)):
		    score=scorelist[senlist+chr(0)+chr(0)]+scorelist[senlist+ralist]
		    if score>bscore or bscore==0:
			bscore=score+10
		        tl=ralist
		else:
		    score=scorelist[senlist+ralist]
		    if score>bscore or bscore==0:
		        bscore=score
		        tl=ralist
	    if len(tl):
	        bastlist.append((senlist+tl,bscore))
	ralist=[]
	print "bastlist:",time.time()-ST
	scoredlist=sorted(bastlist, key=operator.itemgetter(1),reverse=True)
        totalsize=len(scoredlist)
	
	if brack==0:
            pagestartat=self.page*self.pagesize
            if (pagestartat+self.pagesize)>totalsize:
                pageendat=totalsize
            else:
                pageendat=pagestartat+self.pagesize
	else:
		totalsize=tbs

	for ikey,score in scoredlist[pagestartat:pageendat]:
	    ptl,title=self.purei.querycontent(ikey) 
	    qstart=querydic[ikey[0:4]][ikey[4:6]][0][0]
	    startat=self.findpunctuation(ptl,qstart)
	    backoff=startat-qstart
	    bdest=0
	    if len(ptl)-startat<10:
	    	bdest=(startat+10-len(ptl))
	    startat=(startat-bdest)
	    #backoff=(backoff-bdest)
	    #print qstart,len(ptl),startat,backoff
	    positionlist=[]
	    for position in querydic[ikey[0:4]][ikey[4:6]]:
		if position[0]-startat>100:
		    break
		else:
		    positionlist.append((position[0]-startat,position[1]-startat))
		    #print position[0]-startat,position[1]-startat
	    url=urllib.quote(self.urldb[self.serialdb[ikey[0:4]]])
	    if subjectdic.has_key(ikey[0:4]):
	    	titleposition=subjectdic[ikey[0:4]]
	    else:
		titleposition=''
	    ralist.append((ptl[startat:startat+100],score,'',url,title,positionlist,titleposition))
	print time.time()-ST
	return (totalsize,ralist)

    def contentquery(self): 
        ST=time.time()
	urllist=[]
        totalword=len(self.wordlist)
	for x in self.wordlist:
		print self.worddic[x]
        print "total word:", totalword
	for dbname in self.wordlist:
	    if self.tempdb.has_key(dbname):
	       urllist=urllist+self.Stringload(zlib.decompress(self.tempdb[dbname]))
        if totalword>1:
	    ranklist=ranking.ranking(urllist)
	    urlcomp=ranklist.dicuniques(totalword)
        else:
	    urlcomp=urllist
	totalsize=len(urlcomp)
	ralist=[]
	pagestartat=self.page*self.pagesize
	if (pagestartat+self.pagesize)>totalsize:
		pageendat=totalsize
	else:
		pageendat=pagestartat+self.pagesize
	
	print time.time()-ST

        if totalword!=1:
	    rangestsart=0
	    rangeend=totalsize
	    if totalsize>500 or pagestartat>=500:
	    	rangestsart=(pagestartat//500)*500
		rangeend=rangestsart+500
		pagestartat=pagestartat-rangestsart
		pageendat=pagestartat+self.pagesize
	    #for i in xrange(0,totalsize):
	    count=0
	    searchtime=0.0
	    linktime=0.0
	    for i in xrange(rangestsart,rangeend):
		bastscore=0
	        mirs=0
	        spliturl=urlcomp[i]
		if totalword>=3:
			sword=3
		else:
			sword=totalword

		if len(spliturl)==2 and spliturl[0]==totalword:
		    at=time.time()
		    self.pct,title=self.purei.queryPurecontent(spliturl[1])
		    bt=time.time()-at
		    matchstart=0
		    scorelist=[]

		    searchtime=searchtime+bt
		    for match in re.finditer(self.uni.decode("utf-8"),self.pct):
			matchstart=match.start()
		    if matchstart:
			bastscore=60
			if (matchstart+150)> len(self.pct):
				mirs=len(self.pct)-matchstart
			scorelist.append((match.start(),match.end()))
			startat=self.findpunctuation(matchstart)
			scorelist=self.wordmarkup(scorelist,startat-mirs)
			abstract=startat-mirs
			destcontent=self.pct[abstract:abstract+150]
			url=urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
			ralist.append((destcontent,bastscore,str(spliturl[0]),url,title,scorelist))

		if len(spliturl)==2 and spliturl[0]>=sword and bastscore==0:
		    at=time.time()
		    # self.pct,title=self.purei.queryPurecontent(spliturl[1])
		    r=[]
		    for dbname in self.wordlist:
			if self.tempdb.has_key(dbname):
				    picklelist=[]
				    for match in re.finditer(self.worddic[dbname],self.pct):
					picklelist.append((match.start(),match.end()))
				    r=r+picklelist
		    r=sorted(r, key=operator.itemgetter(0))
		    r=ranklist.wordlinker(r)
		    bastscore, scorelist = ranklist.counttheimportantpart(r)
		    #print scorelist
		    if len(scorelist)>0:
		        startat=scorelist[0][0]
		        startat=self.findpunctuation(startat)
		        if (startat+150)> len(self.pct):
		            mirs=len(self.pct)-startat
		        scorelist=self.wordmarkup(scorelist,startat-mirs)
		        abstract=startat-mirs
		        destcontent=self.pct[abstract:abstract+150]
		        url=urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
		        ralist.append((destcontent,bastscore,str(spliturl[0]),url,title,scorelist))
		    bt=time.time()-at
		    linktime=linktime+bt

	print 'totalword2:',
	print time.time()-ST

        if totalword==1:
	    for i in xrange(pagestartat,pageendat):
                bastscore=0
                mirs=0
                spliturl=urlcomp[i]
		self.pct,title=self.purei.queryPurecontent(spliturl)
		matchstart=0
		scorelist=[]
		picklelist=[]
		for match in re.finditer(self.uni,self.pct):
                    matchstart=match.start()
		    picklelist.append((match.start(),match.end()))
                if (matchstart+100) > len(self.pct):
                    mirs=len(self.pct)-matchstart
		scorelist=picklelist
		startat=scorelist[0][0]
		startat=self.findpunctuation(startat)
		scorelist=self.wordmarkup(scorelist,startat-mirs)
		abstract=startat-mirs
		destcontent=self.pct[abstract:abstract+150]
		url=urllib.quote(self.urldb[self.serialdb[spliturl[0:4]]])
		#print destcontent,str(1),url,title,scorelist
		ralist.append((destcontent,100,str(1),url,title,scorelist))
	    print 'totalword1:',
	    print time.time()-ST
	    return (totalsize,sorted(ralist, key=operator.itemgetter(1),reverse=True))
	print "search:",str(searchtime)
	print "Link:",str(linktime)
	return (totalsize,sorted(ralist, key=operator.itemgetter(1),reverse=True)[pagestartat:pageendat])