def linesplitinster(md5urllist):
    purei = Purecontent("r")
    total = len(md5urllist)
    wordi = TextInsert()
    wsynccount = 0
    for md5url in md5urllist.keys():
        st = time.time()
        tail = []
        totaldic = 0
        totalcomp = 0
        pureserial = purei.queryserial(md5url)
        if purei.querycontentcount(pureserial):
            purecount = int(purei.querycontentcount(pureserial)) + 1
        else:
            purecount = 0
        for seri in xrange(purecount):
            querykey = pureserial + contentprocess.lintoascii(seri)
            while count_active(tail) >= config.splitercpu:
                time.sleep(0.5)
            getre = bngram.wordspliting(purei.querycontentinline(querykey), querykey)
            tail.append(getre)
            getre.start()  # executes getre.run()
        dba = DataInsert()
        dba.outdicdbinit()  # open the database for words that are not in the dic
        dba.companwordcount = 0
        wa = 0  # set when the anu utf-8 dic has to be reloaded
        for splitterlist in tail:
            splitterlist.join(config.splitertimeout)
            totalcomp = totalcomp + len(splitterlist.companword)
            totaldic = totaldic + len(splitterlist.dicword)
            dba.wordlist = splitterlist.companword
            if dba.wordlist:
                dba.anuworddb()
                wa = 1
        dba.outdicdbclose()
        if wa:
            wordi.anureload()
        # print dba.companwordcount, totalcomp, totaldic
        # wordi = TextInsert()
        for splitterlist in tail:
            if splitterlist.dicword:
                wordi.getdicdb = 1
                wordi.dicword = splitterlist.dicword
                wordi.tempwurl(splitterlist.querykey)
            if splitterlist.companword:
                wordi.getdicdb = 2
                wordi.dicword = splitterlist.companword
                wordi.tempwurl(splitterlist.querykey)
        tail = []
        # print time.time() - st
        wsynccount += 1
        if wsynccount > 8192:
            stderr.write("dbsync")
            wordi.sync_wpage()
            wsynccount = 0
            if reloadxmlrpcd():
                stderr.write("+")
        stderr.write(".")
    title, word = "", ""
    stderr.write("dbsync")
    wordi.sync_wpage()
    if reloadxmlrpcd():
        stderr.write("+")
    wordi.closedicdb()
    purei.close()
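
# count_active() is called by linesplitinster() above but is not defined in
# this section; a minimal sketch, assuming each entry in tail is a
# threading.Thread subclass (as bngram.wordspliting appears to be, since it
# is start()ed and join()ed):
def count_active(tail):
    # isAlive() is the Python 2 spelling of Thread.is_alive()
    return len([t for t in tail if t.isAlive()])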
class Contentprocess(object):
    def __init__(self):
        self.uni = ""
        self.title = ""
        self.content = ""
        self.md5urllist = {}
        self.purei = Purecontent("c")
        self.urlinsert = DataInsert()
        self.urlinsert.urldbinit()

    def closeandreturn(self):
        self.purei.close()
        self.urlinsert.urldbclose()
        return self.md5urllist

    def contentadd(self, largeinsert):
        for x in largeinsert.keys():
            self.uni = x
            cdata = largeinsert[x]
            self.title = cdata[0]
            self.content = cdata[1]
            self.contentinsert()

    def contentinsert(self):
        md5url = hashlib.md5(self.uni).hexdigest()
        self.purei.url_md5 = md5url
        self.md5urllist[md5url] = self.uni
        # url db
        self.urlinsert.url = self.uni
        self.urlinsert.md5url = md5url
        self.urlinsert.inserturldb()
        stmk = stopmarks()
        if self.purei.checkexist():
            self.purei.title = self.title.encode("utf-8")
            context = ""
            word = self.content
            n = 0
            for xw in word:
                if ord(xw) >= 32 or ord(xw) in [9, 10, 13]:
                    context = context + xw
                    n += 1
                if n > 40000000:  # may exceed 65535 lines for one document
                    break
            context = context + chr(32)
            contline = []
            contline.append("")
            word = ""  # release the word value
            i = 0  # current line index in the contline list
            x = 0  # character position in context
            msl = 260  # minimum sentence length before a split is allowed
            while x < len(context):
                ordx = ord(context[x])
                contline[i] = contline[i] + context[x]
                sentencecount = len(clearspace(contline[i]))
                if (
                    sentencecount > msl and stmk.atypestopmarks(ordx)
                    or sentencecount > msl and context[x:x + 2] == ". "
                    or sentencecount > msl + 20 and stmk.btypestopmarks(ordx)
                    or sentencecount > msl + 20 and ordx == 10
                    and ord(context[x + 1:x + 2]) < 65
                ):
                    nextword = context[x + 1:x + 2]
                    if nextword:
                        if punctuationmarks(ord(nextword)):
                            # in some cases a Chinese sentence ends with two marks
                            x += 1
                            contline[i] = contline[i] + context[x]
                    contline.append("")
                    i = len(contline) - 1
                    if msl <= 16640 and i % 2:
                        msl = msl + msl  # double it until the value exceeds 16640
                x += 1
                if sentencecount < msl:
                    contline[i] = contline[i] + context[x:x + msl]
                    x = x + msl
            contcleanline = []
            i = 0  # i for contline
            for x in contline:
                cont = clearspace(x)
                if len(cont) > 1:
                    if cont[0] == chr(32) and cont[-1] == chr(32):
                        cont = cont[1:-1]
                    elif cont[-1] == chr(32):
                        cont = cont[:-1]
                    elif cont[0] == chr(32):
                        cont = cont[1:]
                    if len(cont) < 65025 and cont != chr(32):
                        contcleanline.append(cont.encode("utf-8"))
                i = i + 1
            self.purei.purecotentinline = contcleanline
            self.purei.content = clearspace(context).encode("utf-8")
            self.purei.insertPurecontent()
        stderr.write(".")
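
# A minimal usage sketch for Contentprocess; the URL, title, and body below
# are illustrative assumptions, not fixtures from this codebase. contentadd()
# takes a dict mapping each url to a [title, body] pair and stores the split
# sentence lines through Purecontent; closeandreturn() hands back the
# md5 -> url map for later indexing.
def _contentprocess_example():
    cp = Contentprocess()
    cp.contentadd({"http://example.com/page":
                   [u"A title", u"First sentence. Second sentence."]})
    return cp.closeandreturn()  # {md5(url): url}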
def OriginalHTMLprocess(listsplit):
    OriginalHTMLdb = OriginalPage()
    ilog = infologger()
    purei = Purecontent("c")
    pat = re.compile("<([^>]|\n)*>")  # strip html tags
    # strip entity remnants and control chars (the entity escapes were garbled
    # in the source; reconstructed as the literal &nbsp;/&copy; entities)
    space = re.compile("&nbsp;|&copy;|\r|\t")
    stmk = stopmarks()
    md5urllist = {}
    for i in listsplit:
        md5url = md5hex(i)
        md5urllist[md5url] = [i]
        word = ""
        st = time.time()
        purei.url_md5 = md5url
        if purei.checkexist():
            OriginalHTMLdb.url = i
            parser = html2txt()
            try:
                parser.feed(OriginalHTMLdb.queryoriginalct())
                charset = parser.charset  # charset detector
                parser.close()
            except:
                charset = ""
            Originaltext = langconvert(OriginalHTMLdb.queryoriginalct(), charset)
            Originaltext = Originaltext.decode("utf-8")
            ilog.sentence_split_info(time.time() - st)
            try:
                # this page is in normal html format
                parser = ""
                parser = html2txt()
                parser.feed(Originaltext)
                word = word + parser.text
                if len(word) == 0:
                    word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
                contenttitle = clearspace(parser.title)
                parser.close()
                # print contenttitle, i, charset
                purei.title = contenttitle.encode("utf-8")
            except:
                try:
                    parser = html2txt()
                    parser.feed(Originaltext)
                    contenttitle = clearspace(parser.title)
                    parser.close()
                except:
                    contenttitle = ""
                purei.title = contenttitle.encode("utf-8")
                word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
            context = ""
            ilog.sentence_split_info(time.time() - st)
            n = 0
            for xw in word:
                if ord(xw) >= 32 or ord(xw) in [9, 10, 13]:
                    context = context + xw
                    n += 1
                if n > 40000000:  # may exceed 65535 lines for one document
                    break
            ilog.sentence_split_info(purei.title + str(len(context)) + i + charset)
            context = context + chr(32)
            contline = []
            contline.append("")
            i = 0  # note: reuses the url loop variable as the contline index from here on
            # for x in xrange(len(context)):
            x = 0  # character position in context
            msl = 260
            while x < len(context):
                ordx = ord(context[x])
                contline[i] = contline[i] + context[x]
                sentencecount = len(clearspace(contline[i]))
                # sentencecount = len(contline[i])
                if (
                    sentencecount > msl and stmk.atypestopmarks(ordx)
                    or sentencecount > msl and context[x:x + 2] == ". "
                    or sentencecount > msl + 20 and stmk.btypestopmarks(ordx)
                    or sentencecount > msl + 20 and ordx == 10
                    and ord(context[x + 1:x + 2]) < 65
                ):
                    nextword = context[x + 1:x + 2]
                    if nextword:
                        if punctuationmarks(ord(nextword)):
                            # in some cases a Chinese sentence ends with two marks
                            x += 1
                            contline[i] = contline[i] + context[x]
                    contline.append("")
                    i += 1
                    if msl <= 16640 and i % 2:
                        msl = msl + msl  # double it until the value exceeds 16640
                x += 1
                if sentencecount < msl:
                    contline[i] = contline[i] + context[x:x + msl]
                    x = x + msl
            contcleanline = []
            i = 0
            ilog.sentence_split_info(time.time() - st)
            for x in contline:
                cont = clearspace(x)
                if len(cont) > 1:
                    if cont[0] == chr(32) and cont[-1] == chr(32):
                        cont = cont[1:-1]
                    elif cont[-1] == chr(32):
                        cont = cont[:-1]
                    elif cont[0] == chr(32):
                        cont = cont[1:]
                    if len(cont) < 65025 and cont != chr(32):
                        contcleanline.append(cont.encode("utf-8"))
                i = i + 1
            ilog.sentence_split_info(time.time() - st)
            purei.purecotentinline = contcleanline
            purei.content = clearspace(context).encode("utf-8")
            purei.insertPurecontent()
            stderr.write(".")
    OriginalHTMLdb.close()
    purei.close()
    return md5urllist
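
# Both sentence splitters above start with a minimum sentence length (msl) of
# 260 characters and double it on every other appended line while
# msl <= 16640 (the "if msl <= 16640 and i % 2" branch). A standalone sketch
# of that schedule, for reference; _msl_schedule is a hypothetical helper,
# not part of the original module:
def _msl_schedule(msl=260, limit=16640):
    steps = [msl]
    while msl <= limit:
        msl = msl + msl
        steps.append(msl)
    return steps  # [260, 520, 1040, 2080, 4160, 8320, 16640, 33280]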
class QueryObj(object):
    def __init__(self):
        self.uni = ''
        self.pct = ''
        self.page = 0
        self.pagesize = 10
        self.worddic = {}
        self.wordlist = []
        self.dq = DicQuery()

    def udngram(self):
        tail = []
        getre = bngram.wordspliting(self.uni, "query")
        getre.start()
        getre.join()
        tail.append(getre)
        self.worddic, self.wordlist = self.dq.tailobject(tail)

    def dbinit(self):
        dbtype = db.DB_HASH
        dbopenflags = db.DB_THREAD
        envflags = db.DB_THREAD | db.DB_INIT_MPOOL | db.DB_INIT_LOCK
        dbenv = db.DBEnv()
        dbenv.set_mp_mmapsize(50 * 1024 * 1024)
        dbenv.set_cachesize(0, 100 * 1024 * 1024, 0)
        homeDir = 'database/world'
        dbenv.open(homeDir, envflags | db.DB_CREATE)
        self.tempdb = db.DB(dbenv)
        self.tempdb.open('tempwdb.db', mode=0660, dbtype=db.DB_HASH,
                         flags=dbopenflags | db.DB_CREATE)
        wdbenv = db.DBEnv()
        envflags = db.DB_THREAD | db.DB_INIT_MPOOL | db.DB_INIT_LOCK
        wdbenv.set_mp_mmapsize(100 * 1024 * 1024)
        wdbenv.set_cachesize(0, 200 * 1024 * 1024, 0)
        homeDir = 'database/wpage/'
        wdbenv.open(homeDir, envflags | db.DB_CREATE)
        self.wpagedb = db.DB(wdbenv)
        self.wpagedb.open('wpagedb.db', None, db.DB_HASH, db.DB_DIRTY_READ)
        self.purei = Purecontent('r')
        self.serialdb = db.DB()
        self.serialdb.open('database/pureserial.db', None, db.DB_BTREE, db.DB_DIRTY_READ)
        self.urldb = db.DB()
        self.urldb.open('dic/anurl.db', None, db.DB_HASH, db.DB_DIRTY_READ)

    def dbclose(self):
        self.dq.dicclose()
        self.purei.close()
        self.urldb.close()
        self.tempdb.close()
        self.wpagedb.close()

    def wordmarkup(self, scorelist, startat):
        tempscorelist = []
        at = scorelist[0][0] - startat
        for x in scorelist:
            start, end = x
            start = at + start - scorelist[0][0]
            end = at + end - scorelist[0][0]
            tempscorelist.append((start, end))
        return tempscorelist

    def uniques(self, wlist):
        u = []
        for l in wlist:
            if l not in u and len(l) > 0:
                u.append(l)
        return u

    def dpickdump(self, picklist):
        dpickstring = ""
        for x in picklist:
            if len(str(x[0])) < 9:
                dpickstring = dpickstring + str(len(str(x[0]))) + str(x[0])
        return dpickstring

    def dpickload(self, pick, wordsize):
        i = 0
        b = []
        while i < len(pick):
            start = int(pick[1 + i:1 + i + int(pick[i])])
            end = start + wordsize
            b.append((start, end))
            i = i + 1 + int(pick[i])
        return b

    def Stringload(self, UrlString):
        # the packed string is a run of 6-byte document keys
        URLList = {}
        for x in xrange(0, (len(UrlString) // 6)):
            URLList[UrlString[x * 6:(1 + x) * 6]] = ''
        return len(URLList), URLList

    def findpunctuation(self, pct, startat):
        until = 0
        if startat > 20:
            until = 20
        elif startat > 1:
            until = startat
        if until:
            for punctuationx in xrange(1, until):
                if punctuationmarks(ord(pct[startat - punctuationx])):
                    startat = startat - punctuationx + 1
                    break
        return startat

    def query(self):
        ST = time.time()
        conlinklist = {}
        concountlist = {}
        self.wordlist = self.uniques(self.wordlist)
        totalword = len(self.wordlist)
        for dbname in self.wordlist:
            print self.worddic[dbname],
            if self.tempdb.has_key(dbname):
                concountlist[dbname], conlinklist[dbname] = self.Stringload(
                    zlib.decompress(self.tempdb[dbname]))
        print "total word:", totalword
        # pick the conlinklist with the most keys as the intersection base
        # (the original comment said "smallest", but the comparison keeps the largest)
        mixSL = (0, 0)
        for listcount in concountlist.keys():
            if concountlist[listcount] > mixSL[1] or mixSL == (0, 0):
                mixSL = (listcount, concountlist[listcount])
        # merge all of the key lists
        basiclist = conlinklist[mixSL[0]]
        for mx in conlinklist.keys():
            templist = {}
            if mx != mixSL[0]:
                for iclb in conlinklist[mx].keys():
                    if basiclist.has_key(iclb):
                        templist[iclb] = ''
                basiclist = templist
        # basiclist now holds the keys matched by every conlinklist
        querydic = {}
        tbs = len(basiclist)
        print 'est size', tbs, time.time() - ST
        brack = int(tbs // 10000)
        if brack > 0:
            partsize = tbs // brack
            lastpartsize = tbs % brack
            # self.page * self.pagesize
            matst = (self.page % brack) * partsize
            if matst == 0:
                rlist = basiclist.keys()[matst:partsize + lastpartsize]
            else:
                rlist = basiclist.keys()[matst + lastpartsize:matst + partsize]
            pagestartat = int(self.page // brack) * self.pagesize
            # if (pagestartat + self.pagesize) > totalsize:
            #     pageendat = totalsize
            # else:
            pageendat = pagestartat + self.pagesize
        else:
            rlist = basiclist.keys()
        RTS = 0.0
        PT = time.time()
        tplist = ''
        tpindex = {}
        for wi in self.wordlist:
            tplist = zlib.decompress(self.wpagedb.get(wi))
            idx = 0
            if tplist:
                while idx < len(tplist):
                    querykey = tplist[idx:idx + 6]  # querykey
                    tw = ord(tplist[idx + 6])  # wordcount
                    nextidx = idx + 7 + 3 * (tw + 1)
                    wposition = tplist[idx + 7:nextidx]
                    idx = nextidx
                    tpindex[wi + querykey] = wposition
                    # print len(wi + querykey), tw, len(wposition)
        print time.time() - ST
        for x in rlist:
            # ptl, title = self.purei.querycontent(x)
            if not querydic.has_key(x[0:4]):
                querydic[x[0:4]] = {}
            posilist = []
            ase = 0
            PT = time.time()
            for pw in self.wordlist:
                # each record is a 6-byte querykey + 1-byte wordcount
                # + wordcount * 3-byte positions
                position = tpindex[pw + x]
                totalpositionsize = len(position) // 3
                prseek = 0
                for pr in xrange(totalpositionsize):
                    pse = position[prseek:prseek + 3]
                    start = (ord(pse[0]) << 8) + ord(pse[1])
                    end = start + ord(pse[2])
                    posilist.append((start, end))
                    prseek += 3
            RTS = RTS + (time.time() - PT)
            posilist.sort()
            # if x[4:6] == chr(0) + chr(0):
            #     print posilist
            querydic[x[0:4]][x[4:6]] = posilist
            # print posilist
        print RTS
        print len(querydic), time.time() - ST
        scorelist = {}
        for senlist in querydic.keys():
            # print contentprocess.asciitoint(senlist), ':'
            for ralist in querydic[senlist].keys():
                # print chr(32) * 4, contentprocess.asciitolin(ralist), '->',
                rx = querydic[senlist][ralist][0]
                x1, y1 = rx[0], rx[1]
                # print rx[0], rx[1],
                i, c = 1, 0  # i and c track the word-link state
                score = 0
                x2, y2 = 0, 0
                a = []
                for rx in querydic[senlist][ralist][1:]:
                    x2, y2 = rx[0], rx[1]
                    if y1 == x2 or (y1 - 1) == x2:
                        x1, y1 = (x1, y2)
                        i, c = 1, 1
                    elif i == 1 and c == 0:
                        i = 0
                        a.append((x1, y1))
                        score = score + (y1 - x1 - 2)
                        x1, y1 = (x2, y2)
                        # pse = rx[2]
                    else:
                        a.append((x1, y1))
                        score = score + (y1 - x1 - 2)
                        x1, y1 = (x2, y2)
                        pse = ''
                        c = 0
                if i == 1:
                    a.append((x1, y1))
                    score = score + (y1 - x1 - 2)
                if a[-1][1] != y2 and y2 != 0:
                    a.append((x2, y2))
                if a:
                    querydic[senlist][ralist] = a
                    scorelist[senlist + ralist] = score
        # choose the best part of the destination content
        print "Scorelist:", time.time() - ST
        bastlist = []
        subjectdic = {}
        for senlist in querydic.keys():
            bscore = 0
            addscore = 0
            tl = ''
            for ralist in querydic[senlist].keys():
                if ralist == chr(0) + chr(0):
                    subjectdic[senlist] = querydic[senlist][ralist]
                elif scorelist.has_key(senlist + chr(0) + chr(0)):
                    score = scorelist[senlist + chr(0) + chr(0)] + scorelist[senlist + ralist]
                    if score > bscore or bscore == 0:
                        bscore = score + 10
                        tl = ralist
                else:
                    score = scorelist[senlist + ralist]
                    if score > bscore or bscore == 0:
                        bscore = score
                        tl = ralist
            if len(tl):
                bastlist.append((senlist + tl, bscore))
        ralist = []  # reused from here on as the result list
        print "bastlist:", time.time() - ST
        scoredlist = sorted(bastlist, key=operator.itemgetter(1), reverse=True)
        totalsize = len(scoredlist)
        if brack == 0:
            pagestartat = self.page * self.pagesize
            if (pagestartat + self.pagesize) > totalsize:
                pageendat = totalsize
            else:
                pageendat = pagestartat + self.pagesize
        else:
            totalsize = tbs
        for ikey, score in scoredlist[pagestartat:pageendat]:
            ptl, title = self.purei.querycontent(ikey)
            qstart = querydic[ikey[0:4]][ikey[4:6]][0][0]
            startat = self.findpunctuation(ptl, qstart)
            backoff = startat - qstart
            bdest = 0
            if len(ptl) - startat < 10:
                bdest = (startat + 10 - len(ptl))
                startat = (startat - bdest)
                # backoff = (backoff - bdest)
            # print qstart, len(ptl), startat, backoff
            positionlist = []
            for position in querydic[ikey[0:4]][ikey[4:6]]:
                if position[0] - startat > 100:
                    break
                else:
                    positionlist.append((position[0] - startat, position[1] - startat))
                    # print position[0] - startat, position[1] - startat
            url = urllib.quote(self.urldb[self.serialdb[ikey[0:4]]])
            if subjectdic.has_key(ikey[0:4]):
                titleposition = subjectdic[ikey[0:4]]
            else:
                titleposition = ''
            ralist.append((ptl[startat:startat + 100], score, '', url, title,
                           positionlist, titleposition))
        print time.time() - ST
        return (totalsize, ralist)

    def contentquery(self):
        ST = time.time()
        urllist = []
        totalword = len(self.wordlist)
        for x in self.wordlist:
            print self.worddic[x]
        print "total word:", totalword
        for dbname in self.wordlist:
            if self.tempdb.has_key(dbname):
                # Stringload() returns (count, keydict); collect the 6-byte
                # document keys themselves (the original concatenated the
                # returned tuple directly, which raises TypeError on a list)
                count, keydict = self.Stringload(zlib.decompress(self.tempdb[dbname]))
                urllist = urllist + keydict.keys()
        if totalword > 1:
            ranklist = ranking.ranking(urllist)
            urlcomp = ranklist.dicuniques(totalword)
        else:
            urlcomp = urllist
        totalsize = len(urlcomp)
        ralist = []
        pagestartat = self.page * self.pagesize
        if (pagestartat + self.pagesize) > totalsize:
            pageendat = totalsize
        else:
            pageendat = pagestartat + self.pagesize
        print time.time() - ST
        if totalword != 1:
            rangestsart = 0
            rangeend = totalsize
            if totalsize > 500 or pagestartat >= 500:
                rangestsart = (pagestartat // 500) * 500
                rangeend = rangestsart + 500
                pagestartat = pagestartat - rangestsart
                pageendat = pagestartat + self.pagesize
            # for i in xrange(0, totalsize):
            count = 0
            searchtime = 0.0
            linktime = 0.0
            for i in xrange(rangestsart, rangeend):
                bastscore = 0
                mirs = 0
                spliturl = urlcomp[i]
                if totalword >= 3:
                    sword = 3
                else:
                    sword = totalword
                if len(spliturl) == 2 and spliturl[0] == totalword:
                    at = time.time()
                    self.pct, title = self.purei.queryPurecontent(spliturl[1])
                    bt = time.time() - at
                    matchstart = 0
                    scorelist = []
                    searchtime = searchtime + bt
                    for match in re.finditer(self.uni.decode("utf-8"), self.pct):
                        matchstart = match.start()
                        if matchstart:
                            bastscore = 60
                        if (matchstart + 150) > len(self.pct):
                            mirs = len(self.pct) - matchstart
                        scorelist.append((match.start(), match.end()))
                    if scorelist:
                        # guard added: wordmarkup() indexes scorelist[0], which
                        # would raise when the full query never matches as one run
                        startat = self.findpunctuation(self.pct, matchstart)
                        scorelist = self.wordmarkup(scorelist, startat - mirs)
                        abstract = startat - mirs
                        destcontent = self.pct[abstract:abstract + 150]
                        url = urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
                        ralist.append((destcontent, bastscore, str(spliturl[0]),
                                       url, title, scorelist))
                if len(spliturl) == 2 and spliturl[0] >= sword and bastscore == 0:
                    at = time.time()
                    # re-read the content here: when the exact-match branch above
                    # did not run, self.pct would otherwise be stale
                    self.pct, title = self.purei.queryPurecontent(spliturl[1])
                    r = []
                    for dbname in self.wordlist:
                        if self.tempdb.has_key(dbname):
                            picklelist = []
                            for match in re.finditer(self.worddic[dbname], self.pct):
                                picklelist.append((match.start(), match.end()))
                            r = r + picklelist
                    r = sorted(r, key=operator.itemgetter(0))
                    r = ranklist.wordlinker(r)
                    bastscore, scorelist = ranklist.counttheimportantpart(r)
                    # print scorelist
                    if len(scorelist) > 0:
                        startat = scorelist[0][0]
                        startat = self.findpunctuation(self.pct, startat)
                        if (startat + 150) > len(self.pct):
                            mirs = len(self.pct) - startat
                        scorelist = self.wordmarkup(scorelist, startat - mirs)
                        abstract = startat - mirs
                        destcontent = self.pct[abstract:abstract + 150]
                        url = urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
                        ralist.append((destcontent, bastscore, str(spliturl[0]),
                                       url, title, scorelist))
                    bt = time.time() - at
                    linktime = linktime + bt
            print 'totalword2:',
            print time.time() - ST
        if totalword == 1:
            for i in xrange(pagestartat, pageendat):
                bastscore = 0
                mirs = 0
                spliturl = urlcomp[i]
                self.pct, title = self.purei.queryPurecontent(spliturl)
                matchstart = 0
                scorelist = []
                picklelist = []
                # decode to match self.pct, as in the multi-word branch above
                for match in re.finditer(self.uni.decode("utf-8"), self.pct):
                    matchstart = match.start()
                    picklelist.append((match.start(), match.end()))
                    if (matchstart + 100) > len(self.pct):
                        mirs = len(self.pct) - matchstart
                scorelist = picklelist
                if scorelist:
                    startat = scorelist[0][0]
                    startat = self.findpunctuation(self.pct, startat)
                    scorelist = self.wordmarkup(scorelist, startat - mirs)
                    abstract = startat - mirs
                    destcontent = self.pct[abstract:abstract + 150]
                    url = urllib.quote(self.urldb[self.serialdb[spliturl[0:4]]])
                    # print destcontent, str(1), url, title, scorelist
                    ralist.append((destcontent, 100, str(1), url, title, scorelist))
            print 'totalword1:',
            print time.time() - ST
            return (totalsize, sorted(ralist, key=operator.itemgetter(1), reverse=True))
        print "search:", str(searchtime)
        print "Link:", str(linktime)
        return (totalsize,
                sorted(ralist, key=operator.itemgetter(1),
                       reverse=True)[pagestartat:pageendat])
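
# QueryObj.query() above decodes two packed string formats (inferred from the
# code itself, not from separate documentation): tempdb values are runs of
# 6-byte document keys (see Stringload), and wpagedb values repeat
# [6-byte querykey][1-byte wordcount][(wordcount + 1) * 3-byte positions],
# where each position is a 2-byte big-endian start plus a 1-byte length.
# A sketch of decoding one 3-byte position, mirroring the arithmetic in
# query(); _decode_position is a hypothetical helper, not part of the module:
def _decode_position(pse):
    start = (ord(pse[0]) << 8) + ord(pse[1])
    end = start + ord(pse[2])
    return (start, end)

# _decode_position(chr(1) + chr(4) + chr(3))  ->  (260, 263)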