Example #1
    def dbinit(self):
        # environment for the temporary word database: hash format, threaded,
        # with a shared memory pool and locking
        dbopenflags = db.DB_THREAD
        envflags = db.DB_THREAD | db.DB_INIT_MPOOL | db.DB_INIT_LOCK
        dbenv = db.DBEnv()
        dbenv.set_mp_mmapsize(50 * 1024 * 1024)
        dbenv.set_cachesize(0, 100 * 1024 * 1024, 0)
        homeDir = 'database/world'
        dbenv.open(homeDir, envflags | db.DB_CREATE)
        self.tempdb = db.DB(dbenv)
        self.tempdb.open('tempwdb.db', mode=0660, dbtype=db.DB_HASH,
                         flags=dbopenflags | db.DB_CREATE)

        # environment for the word/page index, with a larger cache
        wdbenv = db.DBEnv()
        wdbenv.set_mp_mmapsize(100 * 1024 * 1024)
        wdbenv.set_cachesize(0, 200 * 1024 * 1024, 0)
        homeDir = 'database/wpage/'
        wdbenv.open(homeDir, envflags | db.DB_CREATE)
        self.wpagedb = db.DB(wdbenv)
        self.wpagedb.open('wpagedb.db', None, db.DB_HASH, db.DB_DIRTY_READ)

        self.purei = Purecontent('r')
        self.serialdb = db.DB()
        self.serialdb.open('database/pureserial.db', None, db.DB_BTREE, db.DB_DIRTY_READ)
        self.urldb = db.DB()
        self.urldb.open('dic/anurl.db', None, db.DB_HASH, db.DB_DIRTY_READ)
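
A minimal sketch of how a Berkeley DB environment like the one in dbinit() is exercised, assuming the bsddb3 bindings these examples use; the example_env path and the demo key/value pair are illustrative, not from the original code.

# Hedged sketch: 'example_env' and the demo key/value are assumptions.
import os
from bsddb3 import db

homeDir = 'example_env'
if not os.path.isdir(homeDir):
    os.makedirs(homeDir)
envflags = db.DB_THREAD | db.DB_INIT_MPOOL | db.DB_INIT_LOCK
dbenv = db.DBEnv()
dbenv.open(homeDir, envflags | db.DB_CREATE)
d = db.DB(dbenv)
d.open('demo.db', dbtype=db.DB_HASH, flags=db.DB_THREAD | db.DB_CREATE)
d.put('key', 'value')            # Python 2 byte strings, as in the examples
assert d.get('key') == 'value'
d.close()                        # close databases before their environment
dbenv.close()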
Example #2
    def __init__(self):
        self.uni = ""
        self.title = ""
        self.content = ""
        self.md5urllist = {}
        self.purei = Purecontent("c")
        self.urlinsert = DataInsert()
        self.urlinsert.urldbinit()
Example #3
def linesplitinster(md5urllist):
    purei = Purecontent("r")
    total = len(md5urllist)
    wordi = TextInsert()
    wsynccount = 0
    for md5url in md5urllist.keys():
        st = time.time()
        tail = []
        totaldic = 0
        totalcomp = 0
        pureserial = purei.queryserial(md5url)
        contentcount = purei.querycontentcount(pureserial)
        if contentcount:
            purecount = int(contentcount) + 1
        else:
            purecount = 0
        for seri in xrange(purecount):
            querykey = pureserial + contentprocess.lintoascii(seri)
            while count_active(tail) >= config.splitercpu:
                time.sleep(0.5)
            getre = bngram.wordspliting(purei.querycontentinline(querykey), querykey)
            tail.append(getre)
        getre.start()  # start the thread; it executes getre.run() concurrently
        dba = DataInsert()
        dba.outdicdbinit()  # open the database of words that fall outside the dictionary
        dba.companwordcount = 0
        wa = 0  # set to 1 when the anuutf-8 dictionary must be reloaded
        for splitterlist in tail:
            splitterlist.join(config.splitertimeout)
            totalcomp = totalcomp + len(splitterlist.companword)
            totaldic = totaldic + len(splitterlist.dicword)
            dba.wordlist = splitterlist.companword
            if dba.wordlist:
                dba.anuworddb()
                wa = 1
        dba.outdicdbclose()
        if wa:
            wordi.anureload()
        # print dba.companwordcount,totalcomp,totaldic
        # wordi=TextInsert()
        for splitterlist in tail:
            if splitterlist.dicword:
                wordi.getdicdb = 1
                wordi.dicword = splitterlist.dicword
                wordi.tempwurl(splitterlist.querykey)
            if splitterlist.companword:
                wordi.getdicdb = 2
                wordi.dicword = splitterlist.companword
                wordi.tempwurl(splitterlist.querykey)
        tail = []
        # print time.time()-st
        wsynccount += 1
        if wsynccount > 8192:
            stderr.write("dbsync")
            wordi.sync_wpage()
            wsynccount = 0
            if reloadxmlrpcd():
                stderr.write("+")
        stderr.write(".")

    title, word = "", ""
    stderr.write("dbsync")
    wordi.sync_wpage()
    if reloadxmlrpcd():
        stderr.write("+")
    wordi.closedicdb()
    purei.close()
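
The loop above caps the number of live splitter threads with count_active(tail), a helper that is not shown in these examples. Assuming the entries of tail behave like threading.Thread workers (they are started and joined like threads), a minimal version could be:

# Hedged sketch of the count_active() helper; the original is not shown.
def count_active(workers):
    active = 0
    for w in workers:
        if w.isAlive():          # Python 2 threading.Thread method
            active += 1
    return active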
Example #4
class Contentprocess(object):
    def __init__(self):
        self.uni = ""
        self.title = ""
        self.content = ""
        self.md5urllist = {}
        self.purei = Purecontent("c")
        self.urlinsert = DataInsert()
        self.urlinsert.urldbinit()

    def closeandreturn(self):
        self.purei.close()
        self.urlinsert.urldbclose()
        return self.md5urllist

    def contentadd(self, largeinsert):
        for x in largeinsert.keys():
            self.uni = x
            cdata = largeinsert[x]
            self.title = cdata[0]
            self.content = cdata[1]
            self.contentinsert()

    def contentinsert(self):
        md5url = hashlib.md5(self.uni).hexdigest()
        self.purei.url_md5 = md5url
        self.md5urllist[md5url] = self.uni
        # url db
        self.urlinsert.url = self.uni
        self.urlinsert.md5url = md5url
        self.urlinsert.inserturldb()
        stmk = stopmarks()

        if self.purei.checkexist():
            self.purei.title = self.title.encode("utf-8")
            context = ""
            word = self.content
            n = 0
            for xw in word:
                if ord(xw) >= 32 or ord(xw) in [9, 10, 13]:
                    context = context + xw
                n += 1
                if n > 40000000:  # cap very long documents (could otherwise exceed 65535 lines)
                    break
            context = context + chr(32)
            contline = []
            contline.append("")
            word = ""  # release word value
            i = 0  # line of contline list
            x = 0  # word number
            msl = 260  # sentence-length threshold before a split is considered
            while x < len(context):
                ordx = ord(context[x])
                contline[i] = contline[i] + context[x]
                sentencecount = len(clearspace((contline[i])))
                if (
                    (sentencecount > msl and stmk.atypestopmarks(ordx))
                    or (sentencecount > msl and context[x : x + 2] == ". ")
                    or (sentencecount > msl + 20 and stmk.btypestopmarks(ordx))
                    or (sentencecount > msl + 20 and ordx == 10
                        and ord(context[x + 1 : x + 2]) < 65)
                ):
                    nextword = context[x + 1 : x + 2]
                    if nextword:
                        if punctuationmarks(ord(nextword)):
                            # in some cases Chinese text uses two consecutive punctuation marks
                            x += 1
                            contline[i] = contline[i] + context[x]
                    contline.append("")
                    i = len(contline) - 1
                    if msl <= 16640 and i % 2:
                        msl = msl + msl  # double msl until it exceeds 16640
                x += 1
                if sentencecount < msl:
                    contline[i] = contline[i] + context[x : x + msl]
                    x = x + msl

            contcleanline = []
            i = 0  # i for contline
            for x in contline:
                cont = clearspace(x)
                if len(cont) > 1:
                    if cont[0] == chr(32) and cont[-1] == chr(32):
                        cont = cont[1:-1]
                    elif cont[-1] == chr(32):
                        cont = cont[:-1]
                    elif cont[0] == chr(32):
                        cont = cont[1:]
                if len(cont) < 65025 and cont != chr(32):
                    contcleanline.append(cont.encode("utf-8"))
                    i = i + 1
            self.purei.purecotentinline = contcleanline
            self.purei.content = clearspace(context).encode("utf-8")
            self.purei.insertPurecontent()
            stderr.write(".")
Example #5
def OriginalHTMLprocess(listsplit):
    OriginalHTMLdb = OriginalPage()
    ilog = infologger()
    purei = Purecontent("c")
    pat = re.compile("<([^>]|\n)*>")           # strip HTML tags
    space = re.compile("&nbsp;|&copy;|\r|\t")  # drop entities and control chars
    stmk = stopmarks()
    md5urllist = {}
    for url in listsplit:
        md5url = md5hex(url)
        md5urllist[md5url] = [url]
        word = ""
        st = time.time()
        purei.url_md5 = md5url
        if purei.checkexist():
            OriginalHTMLdb.url = url
            parser = html2txt()
            try:
                parser.feed(OriginalHTMLdb.queryoriginalct())
                charset = parser.charset  # charset detector
                parser.close()
            except Exception:  # parsing failed; fall back to no detected charset
                charset = ""
            Originaltext = langconvert(OriginalHTMLdb.queryoriginalct(), charset)
            Originaltext = Originaltext.decode("utf-8")
            ilog.sentence_split_info(time.time() - st)
            try:  # if this page is in a normal HTML format
                parser = ""
                parser = html2txt()
                parser.feed(Originaltext)
                word = word + parser.text
                if len(word) == 0:
                    word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
                contenttitle = clearspace(parser.title)
                parser.close()
                # print contenttitle,i,charset
                purei.title = contenttitle.encode("utf-8")
            except Exception:
                try:
                    parser = html2txt()
                    parser.feed(Originaltext)
                    contenttitle = clearspace(parser.title)
                    parser.close()
                except Exception:
                    contenttitle = ""
                purei.title = contenttitle.encode("utf-8")
                word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))

            context = ""
            ilog.sentence_split_info(time.time() - st)
            n = 0
            for xw in word:
                if ord(xw) >= 32 or ord(xw) in [9, 10, 13]:
                    context = context + xw
                n += 1
                if n > 40000000:  # cap very long documents (could otherwise exceed 65535 lines)
                    break
            ilog.sentence_split_info(purei.title + str(len(context)) + url + charset)
            context = context + chr(32)
            contline = []
            contline.append("")
            i = 0  # index of the current line in contline
            x = 0  # character position in context
            msl = 260  # sentence-length threshold before a split is considered
            while x < len(context):
                ordx = ord(context[x])
                contline[i] = contline[i] + context[x]
                sentencecount = len(clearspace((contline[i])))
                # sentencecount=len(contline[i])
                if (
                    (sentencecount > msl and stmk.atypestopmarks(ordx))
                    or (sentencecount > msl and context[x : x + 2] == ". ")
                    or (sentencecount > msl + 20 and stmk.btypestopmarks(ordx))
                    or (sentencecount > msl + 20 and ordx == 10
                        and ord(context[x + 1 : x + 2]) < 65)
                ):
                    nextword = context[x + 1 : x + 2]
                    if nextword:
                        if punctuationmarks(ord(nextword)):
                            # in some cases Chinese text uses two consecutive punctuation marks
                            x += 1
                            contline[i] = contline[i] + context[x]
                    contline.append("")
                    i += 1
                    if msl <= 16640 and i % 2:
                        msl = msl + msl  # double msl until it exceeds 16640
                x += 1
                if sentencecount < msl:
                    contline[i] = contline[i] + context[x : x + msl]
                    x = x + msl

            contcleanline = []
            i = 0
            ilog.sentence_split_info(time.time() - st)
            for x in contline:
                cont = clearspace(x)
                if len(cont) > 1:
                    if cont[0] == chr(32) and cont[-1] == chr(32):
                        cont = cont[1:-1]
                    elif cont[-1] == chr(32):
                        cont = cont[:-1]
                    elif cont[0] == chr(32):
                        cont = cont[1:]
                if len(cont) < 65025 and cont != chr(32):
                    contcleanline.append(cont.encode("utf-8"))
                    i = i + 1
            ilog.sentence_split_info(time.time() - st)
            purei.purecotentinline = contcleanline
            purei.content = clearspace(context).encode("utf-8")
            purei.insertPurecontent()
            stderr.write(".")
    OriginalHTMLdb.close()
    purei.close()
    return md5urllist
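
html2txt is used above to pull out the text, the title, and the charset, but its implementation is not shown. A minimal Python 2 HTMLParser subclass along the same lines might look like the sketch below; it is an assumed stand-in, not the original class (for brevity it only recognizes the HTML5-style meta charset attribute).

# Hedged sketch of an html2txt-style parser; not the original implementation.
from HTMLParser import HTMLParser

class MiniHtml2Txt(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)    # old-style class: no super() in Python 2
        self.text, self.title, self.charset = "", "", ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True
        elif tag == "meta":
            self.charset = dict(attrs).get("charset", self.charset)

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data
        else:
            self.text += data

p = MiniHtml2Txt()
p.feed('<html><head><meta charset="utf-8"><title>t</title></head>'
       '<body>hello</body></html>')
p.close()
print p.title, p.charset, p.text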
Example #6
class QueryObj(object):

    def __init__(self):
        self.uni = ''
        self.pct = ''
        self.page = 0
        self.pagesize = 10
        self.worddic = {}
        self.wordlist = []
        self.dq = DicQuery()

    def udngram(self):
        # split the query string into words with the bngram splitter
        tail = []
        getre = bngram.wordspliting(self.uni, "query")
        getre.start()
        getre.join()
        tail.append(getre)
        self.worddic, self.wordlist = self.dq.tailobject(tail)

    def dbinit(self):
        # environment for the temporary word database: hash format, threaded,
        # with a shared memory pool and locking
        dbopenflags = db.DB_THREAD
        envflags = db.DB_THREAD | db.DB_INIT_MPOOL | db.DB_INIT_LOCK
        dbenv = db.DBEnv()
        dbenv.set_mp_mmapsize(50 * 1024 * 1024)
        dbenv.set_cachesize(0, 100 * 1024 * 1024, 0)
        homeDir = 'database/world'
        dbenv.open(homeDir, envflags | db.DB_CREATE)
        self.tempdb = db.DB(dbenv)
        self.tempdb.open('tempwdb.db', mode=0660, dbtype=db.DB_HASH,
                         flags=dbopenflags | db.DB_CREATE)

        # environment for the word/page index, with a larger cache
        wdbenv = db.DBEnv()
        wdbenv.set_mp_mmapsize(100 * 1024 * 1024)
        wdbenv.set_cachesize(0, 200 * 1024 * 1024, 0)
        homeDir = 'database/wpage/'
        wdbenv.open(homeDir, envflags | db.DB_CREATE)
        self.wpagedb = db.DB(wdbenv)
        self.wpagedb.open('wpagedb.db', None, db.DB_HASH, db.DB_DIRTY_READ)

        self.purei = Purecontent('r')
        self.serialdb = db.DB()
        self.serialdb.open('database/pureserial.db', None, db.DB_BTREE, db.DB_DIRTY_READ)
        self.urldb = db.DB()
        self.urldb.open('dic/anurl.db', None, db.DB_HASH, db.DB_DIRTY_READ)

    def dbclose(self):
        self.dq.dicclose()
        self.purei.close()
        self.urldb.close()
        self.tempdb.close()
        self.wpagedb.close()

    def wordmarkup(self, scorelist, startat):
        # shift match offsets so they are relative to the excerpt start
        tempscorelist = []
        at = scorelist[0][0] - startat
        for x in scorelist:
            start, end = x
            start = at + start - scorelist[0][0]
            end = at + end - scorelist[0][0]
            tempscorelist.append((start, end))
        return tempscorelist

    def uniques(self, wlist):
        # preserve order while dropping duplicates and empty strings
        u = []
        for l in wlist:
            if l not in u and len(l) > 0:
                u.append(l)
        return u

    def dpickdump(self, picklist):
        # serialize each start offset as <digit count><digits>,
        # e.g. 123 becomes "3123"
        dpickstring = ""
        for x in picklist:
            if len(str(x[0])) < 9:
                dpickstring = dpickstring + str(len(str(x[0]))) + str(x[0])
        return dpickstring

    def dpickload(self, pick, wordsize):
        # inverse of dpickdump(): read back the start offsets and attach
        # a span of wordsize to each
        i = 0
        b = []
        while i < len(pick):
            start = int(pick[1 + i:1 + i + int(pick[i])])
            end = start + wordsize
            b.append((start, end))
            i = i + 1 + int(pick[i])
        return b

    def Stringload(self, UrlString):
        # split a packed string into unique 6-byte url keys
        URLList = {}
        for x in xrange(0, len(UrlString) // 6):
            URLList[UrlString[x * 6:(1 + x) * 6]] = ''
        return len(URLList), URLList

    def findpunctuation(self, pct, startat):
        # back up (at most 20 characters) to just after the previous
        # punctuation mark, so an excerpt starts on a natural boundary
        until = 0
        if startat > 20:
            until = 20
        elif startat > 1:
            until = startat
        if until:
            for punctuationx in xrange(1, until):
                if punctuationmarks(ord(pct[startat - punctuationx])):
                    startat = startat - punctuationx + 1
                    break
        return startat

    def query(self):
        ST = time.time()
        conlinklist = {}
        concountlist = {}
        self.wordlist = self.uniques(self.wordlist)
        totalword = len(self.wordlist)
        for dbname in self.wordlist:
            print self.worddic[dbname],
            if self.tempdb.has_key(dbname):
                concountlist[dbname], conlinklist[dbname] = self.Stringload(zlib.decompress(self.tempdb[dbname]))
        print "total word:", totalword
        # find the smallest conlinklist to use as the intersection basis
        mixSL = (0, 0)
        for listcount in concountlist.keys():
            if concountlist[listcount] < mixSL[1] or mixSL == (0, 0):
                mixSL = (listcount, concountlist[listcount])
        # merge (intersect) all of the key lists
        basiclist = conlinklist[mixSL[0]]
        for mx in conlinklist.keys():
            templist = {}
            if mx != mixSL[0]:
                for iclb in conlinklist[mx].keys():
                    if basiclist.has_key(iclb):
                        templist[iclb] = ''
                basiclist = templist
        # basiclist now holds the keys common to every conlinklist
        querydic = {}

        tbs = len(basiclist)
        print 'est size', tbs, time.time() - ST
        brack = int(tbs // 10000)
        if brack > 0:
            partsize = tbs // brack
            lastpartsize = tbs % brack
            matst = (self.page % brack) * partsize
            if matst == 0:
                rlist = basiclist.keys()[matst:partsize + lastpartsize]
            else:
                rlist = basiclist.keys()[matst + lastpartsize:matst + partsize]
            pagestartat = int(self.page // brack) * self.pagesize
            pageendat = pagestartat + self.pagesize
        else:
            rlist = basiclist.keys()

        RTS = 0.0
        tpindex = {}
        for wi in self.wordlist:
            tplist = zlib.decompress(self.wpagedb.get(wi))
            idx = 0
            if tplist:
                while idx < len(tplist):
                    querykey = tplist[idx:idx + 6]    # 6-byte querykey
                    tw = ord(tplist[idx + 6])         # word count
                    nextidx = idx + 7 + 3 * (tw + 1)
                    wposition = tplist[idx + 7:nextidx]
                    idx = nextidx
                    tpindex[wi + querykey] = wposition
        print time.time() - ST
        for x in rlist:
            if not querydic.has_key(x[0:4]):
                querydic[x[0:4]] = {}
            posilist = []
            PT = time.time()
            for pw in self.wordlist:
                # each record holds a 6-byte querykey, a 1-byte word count,
                # then (wordcount + 1) positions of 3 bytes each
                position = tpindex[pw + x]
                totalpositionsize = len(position) // 3
                prseek = 0
                for pr in xrange(totalpositionsize):
                    pse = position[prseek:prseek + 3]
                    start = (ord(pse[0]) << 8) + ord(pse[1])
                    end = start + ord(pse[2])
                    posilist.append((start, end))
                    prseek += 3
            RTS = RTS + (time.time() - PT)
            posilist.sort()
            querydic[x[0:4]][x[4:6]] = posilist
        print RTS
        print len(querydic), time.time() - ST
        scorelist = {}
        for senlist in querydic.keys():
            for ralist in querydic[senlist].keys():
                rx = querydic[senlist][ralist][0]
                x1, y1 = rx[0], rx[1]
                i, c = 1, 0    # i and c track the word-linking state
                score = 0
                x2, y2 = 0, 0
                a = []
                for rx in querydic[senlist][ralist][1:]:
                    x2, y2 = rx[0], rx[1]
                    if y1 == x2 or (y1 - 1) == x2:
                        # adjacent matches: extend the current span
                        x1, y1 = (x1, y2)
                        i, c = 1, 1
                    elif i == 1 and c == 0:
                        i = 0
                        a.append((x1, y1))
                        score = score + (y1 - x1 - 2)
                        x1, y1 = (x2, y2)
                    else:
                        a.append((x1, y1))
                        score = score + (y1 - x1 - 2)
                        x1, y1 = (x2, y2)
                    c = 0
                if i == 1:
                    a.append((x1, y1))
                    score = score + (y1 - x1 - 2)
                if a[-1][1] != y2 and y2 != 0:
                    a.append((x2, y2))
                if a:
                    querydic[senlist][ralist] = a
                scorelist[senlist + ralist] = score
        # choose the best-scoring part of each document
        print "Scorelist:", time.time() - ST
        bastlist = []
        subjectdic = {}
        for senlist in querydic.keys():
            bscore = 0
            tl = ''
            for ralist in querydic[senlist].keys():
                if ralist == chr(0) + chr(0):
                    subjectdic[senlist] = querydic[senlist][ralist]
                elif scorelist.has_key(senlist + chr(0) + chr(0)):
                    score = scorelist[senlist + chr(0) + chr(0)] + scorelist[senlist + ralist]
                    if score > bscore or bscore == 0:
                        bscore = score + 10
                        tl = ralist
                else:
                    score = scorelist[senlist + ralist]
                    if score > bscore or bscore == 0:
                        bscore = score
                        tl = ralist
            if len(tl):
                bastlist.append((senlist + tl, bscore))
        ralist = []
        print "bastlist:", time.time() - ST
        scoredlist = sorted(bastlist, key=operator.itemgetter(1), reverse=True)
        totalsize = len(scoredlist)

        if brack == 0:
            pagestartat = self.page * self.pagesize
            if (pagestartat + self.pagesize) > totalsize:
                pageendat = totalsize
            else:
                pageendat = pagestartat + self.pagesize
        else:
            totalsize = tbs

        for ikey, score in scoredlist[pagestartat:pageendat]:
            ptl, title = self.purei.querycontent(ikey)
            qstart = querydic[ikey[0:4]][ikey[4:6]][0][0]
            startat = self.findpunctuation(ptl, qstart)
            bdest = 0
            if len(ptl) - startat < 10:
                bdest = (startat + 10 - len(ptl))
            startat = (startat - bdest)
            positionlist = []
            for position in querydic[ikey[0:4]][ikey[4:6]]:
                if position[0] - startat > 100:
                    break
                else:
                    positionlist.append((position[0] - startat, position[1] - startat))
            url = urllib.quote(self.urldb[self.serialdb[ikey[0:4]]])
            if subjectdic.has_key(ikey[0:4]):
                titleposition = subjectdic[ikey[0:4]]
            else:
                titleposition = ''
            ralist.append((ptl[startat:startat + 100], score, '', url, title, positionlist, titleposition))
        print time.time() - ST
        return (totalsize, ralist)

    def contentquery(self):
        ST = time.time()
        urllist = []
        totalword = len(self.wordlist)
        for x in self.wordlist:
            print self.worddic[x]
        print "total word:", totalword
        for dbname in self.wordlist:
            if self.tempdb.has_key(dbname):
                # Stringload() returns (count, keydict); collect the keys
                urlcount, urlkeys = self.Stringload(zlib.decompress(self.tempdb[dbname]))
                urllist = urllist + urlkeys.keys()
        if totalword > 1:
            ranklist = ranking.ranking(urllist)
            urlcomp = ranklist.dicuniques(totalword)
        else:
            urlcomp = urllist
        totalsize = len(urlcomp)
        ralist = []
        pagestartat = self.page * self.pagesize
        if (pagestartat + self.pagesize) > totalsize:
            pageendat = totalsize
        else:
            pageendat = pagestartat + self.pagesize

        print time.time() - ST

        if totalword != 1:
            rangestart = 0
            rangeend = totalsize
            if totalsize > 500 or pagestartat >= 500:
                rangestart = (pagestartat // 500) * 500
                rangeend = rangestart + 500
                pagestartat = pagestartat - rangestart
                pageendat = pagestartat + self.pagesize
            searchtime = 0.0
            linktime = 0.0
            for i in xrange(rangestart, rangeend):
                bastscore = 0
                mirs = 0
                spliturl = urlcomp[i]
                if totalword >= 3:
                    sword = 3
                else:
                    sword = totalword

                if len(spliturl) == 2 and spliturl[0] == totalword:
                    at = time.time()
                    self.pct, title = self.purei.queryPurecontent(spliturl[1])
                    bt = time.time() - at
                    matchstart = 0
                    scorelist = []
                    searchtime = searchtime + bt
                    for match in re.finditer(self.uni.decode("utf-8"), self.pct):
                        matchstart = match.start()
                    if matchstart:
                        bastscore = 60
                        if (matchstart + 150) > len(self.pct):
                            mirs = len(self.pct) - matchstart
                        scorelist.append((match.start(), match.end()))
                        startat = self.findpunctuation(self.pct, matchstart)
                        scorelist = self.wordmarkup(scorelist, startat - mirs)
                        abstract = startat - mirs
                        destcontent = self.pct[abstract:abstract + 150]
                        url = urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
                        ralist.append((destcontent, bastscore, str(spliturl[0]), url, title, scorelist))

                if len(spliturl) == 2 and spliturl[0] >= sword and bastscore == 0:
                    at = time.time()
                    # load the content here; otherwise pct and title could be
                    # stale (or unset) when the branch above was skipped
                    self.pct, title = self.purei.queryPurecontent(spliturl[1])
                    r = []
                    for dbname in self.wordlist:
                        if self.tempdb.has_key(dbname):
                            picklelist = []
                            for match in re.finditer(self.worddic[dbname], self.pct):
                                picklelist.append((match.start(), match.end()))
                            r = r + picklelist
                    r = sorted(r, key=operator.itemgetter(0))
                    r = ranklist.wordlinker(r)
                    bastscore, scorelist = ranklist.counttheimportantpart(r)
                    if len(scorelist) > 0:
                        startat = scorelist[0][0]
                        startat = self.findpunctuation(self.pct, startat)
                        if (startat + 150) > len(self.pct):
                            mirs = len(self.pct) - startat
                        scorelist = self.wordmarkup(scorelist, startat - mirs)
                        abstract = startat - mirs
                        destcontent = self.pct[abstract:abstract + 150]
                        url = urllib.quote(self.urldb[self.serialdb[spliturl[1]]])
                        ralist.append((destcontent, bastscore, str(spliturl[0]), url, title, scorelist))
                    bt = time.time() - at
                    linktime = linktime + bt

        print 'totalword2:',
        print time.time() - ST

        if totalword == 1:
            for i in xrange(pagestartat, pageendat):
                mirs = 0
                spliturl = urlcomp[i]
                self.pct, title = self.purei.queryPurecontent(spliturl)
                matchstart = 0
                picklelist = []
                for match in re.finditer(self.uni, self.pct):
                    matchstart = match.start()
                    picklelist.append((match.start(), match.end()))
                if (matchstart + 100) > len(self.pct):
                    mirs = len(self.pct) - matchstart
                scorelist = picklelist
                startat = scorelist[0][0]
                startat = self.findpunctuation(self.pct, startat)
                scorelist = self.wordmarkup(scorelist, startat - mirs)
                abstract = startat - mirs
                destcontent = self.pct[abstract:abstract + 150]
                url = urllib.quote(self.urldb[self.serialdb[spliturl[0:4]]])
                ralist.append((destcontent, 100, str(1), url, title, scorelist))
            print 'totalword1:',
            print time.time() - ST
            return (totalsize, sorted(ralist, key=operator.itemgetter(1), reverse=True))
        print "search:", str(searchtime)
        print "Link:", str(linktime)
        return (totalsize, sorted(ralist, key=operator.itemgetter(1), reverse=True)[pagestartat:pageendat])
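
query() walks wpagedb records laid out as a 6-byte querykey, one word-count byte, and then (wordcount + 1) positions of 3 bytes each: a 16-bit big-endian start plus an 8-bit span. A round-trip sketch of that layout, using hypothetical pack_record/unpack_record helpers, makes the offsets easier to follow.

# Hedged sketch of the record layout that query() decodes; the helper names
# are illustrative, not from the source.
def pack_record(querykey, positions):
    assert len(querykey) == 6 and len(positions) >= 1
    body = chr(len(positions) - 1)       # stored count is entries - 1
    for start, end in positions:
        body += chr(start >> 8) + chr(start & 0xFF) + chr(end - start)
    return querykey + body

def unpack_record(buf, idx=0):
    querykey = buf[idx:idx + 6]
    tw = ord(buf[idx + 6])
    positions, p = [], idx + 7
    for _ in xrange(tw + 1):
        start = (ord(buf[p]) << 8) + ord(buf[p + 1])
        positions.append((start, start + ord(buf[p + 2])))
        p += 3
    return querykey, positions, p        # p is the offset of the next record

rec = pack_record('ABCDEF', [(5, 7), (300, 303)])
print unpack_record(rec)                 # ('ABCDEF', [(5, 7), (300, 303)], 13)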