def urldbinsert(listsplit):
    """Record every URL of ``listsplit`` in the URL database.

    For each URL the hex MD5 digest is computed, both values are handed to a
    ``DataInsert`` instance through its ``url`` / ``md5url`` attributes, and
    ``inserturldb()`` is called.

    Args:
        listsplit: Iterable of URLs (text or bytes).
    """
    urlinsert = DataInsert()
    urlinsert.urldbinit()  # open the url database
    try:
        for url in listsplit:
            # hashlib only hashes bytes.  Text is hashed as UTF-8, which gives
            # the same digest as before for every URL that used to work
            # (ASCII text) and no longer fails on non-ASCII text.
            raw = url if isinstance(url, bytes) else url.encode("utf-8")
            md5url = hashlib.md5(raw).hexdigest()
            urlinsert.url = url
            urlinsert.md5url = md5url
            urlinsert.inserturldb()
    finally:
        # Close the database even when an insert fails part-way through.
        urlinsert.urldbclose()
def __init__(self):
    """Start with empty document fields and open the two backing stores.

    A ``Purecontent`` object is created with the argument ``"c"`` and a
    ``DataInsert`` object has ``urldbinit()`` called on it right away.
    """
    # Fields of the document currently being handled; all start out empty.
    self.uni = self.title = self.content = ""
    # Starts empty; filled in by other methods of the class.
    self.md5urllist = dict()

    # Keep the construction order: content store first, URL database second.
    self.purei = Purecontent("c")
    self.urlinsert = DataInsert()
    self.urlinsert.urldbinit()
def _linesplit_document(purei, wordi, md5url):
    """Split every stored line of document ``md5url`` and store the words.

    Args:
        purei: Open ``Purecontent`` handle used to read the stored lines.
        wordi: Open ``TextInsert`` handle that receives the split words.
        md5url: Identifier of the document, passed to ``purei.queryserial``.
    """
    pureserial = purei.queryserial(md5url)

    # Ask for the line count once; a falsy answer means nothing to split.
    # NOTE(review): the "+ 1" suggests the stored value is the highest line
    # index rather than a count -- confirm against Purecontent.
    contentcount = purei.querycontentcount(pureserial)
    purecount = (int(contentcount) + 1) if contentcount else 0

    # Start one splitter per stored line, waiting while the number reported
    # by count_active() is at or above config.splitercpu.
    tail = []
    for seri in xrange(purecount):
        querykey = pureserial + contentprocess.lintoascii(seri)
        while count_active(tail) >= config.splitercpu:
            time.sleep(0.5)
        getre = bngram.wordspliting(purei.querycontentinline(querykey), querykey)
        tail.append(getre)
        getre.start()  # execute

    # Store the words that are out of the dictionary.
    dba = DataInsert()
    dba.outdicdbinit()  # open the word database which are out of dic
    wa = False  # whether the anu dictionary has to be reloaded
    try:
        dba.companwordcount = 0
        for splitter in tail:
            # NOTE(review): join() is given a timeout, so a splitter may still
            # be running when its results are read below -- confirm that the
            # result attributes are always present.
            splitter.join(config.splitertimeout)
            dba.wordlist = splitter.companword
            if dba.wordlist:
                dba.anuworddb()
                wa = True
    finally:
        # Close the out-of-dictionary database even if a splitter failed.
        dba.outdicdbclose()
    if wa:
        wordi.anureload()

    # Hand both word lists of every splitter to the text index; getdicdb
    # selects which of the two kinds is being written (1: dicword,
    # 2: companword).
    for splitter in tail:
        if splitter.dicword:
            wordi.getdicdb = 1
            wordi.dicword = splitter.dicword
            wordi.tempwurl(splitter.querykey)
        if splitter.companword:
            wordi.getdicdb = 2
            wordi.dicword = splitter.companword
            wordi.tempwurl(splitter.querykey)


def linesplitinster(md5urllist):
    """Split the stored content of each listed document into words.

    Every key of ``md5urllist`` is processed in turn.  After more than 8192
    documents since the last sync, and once more at the end,
    ``TextInsert.sync_wpage()`` is called followed by ``reloadxmlrpcd()``.
    Progress markers (``"."``, ``"dbsync"``, ``"+"``) are written to
    ``stderr``.

    Args:
        md5urllist: Mapping whose keys identify the documents to process; the
            values are not used here.
    """
    purei = Purecontent("r")
    try:
        wordi = TextInsert()
        try:
            wsynccount = 0  # documents handled since the last sync
            for md5url in md5urllist:
                _linesplit_document(purei, wordi, md5url)

                wsynccount += 1
                if wsynccount > 8192:
                    stderr.write("dbsync")
                    wordi.sync_wpage()
                    wsynccount = 0
                    if reloadxmlrpcd():
                        stderr.write("+")
                stderr.write(".")

            # Final sync for whatever is left since the last periodic one.
            stderr.write("dbsync")
            wordi.sync_wpage()
            if reloadxmlrpcd():
                stderr.write("+")
        finally:
            wordi.closedicdb()
    finally:
        # Closed last, matching the original shutdown order.
        purei.close()
class Contentprocess(object):
    """Store documents in the pure-content store and register their URLs.

    Documents are fed in through ``contentadd``.  ``closeandreturn`` closes
    both stores and returns the md5 -> url mapping collected along the way.
    """

    def __init__(self):
        """Start with empty document fields and open both backing stores."""
        # Fields of the document currently being handled.
        self.uni = ""
        self.title = ""
        self.content = ""
        # md5 hex digest -> url, filled by contentinsert().
        self.md5urllist = {}
        self.purei = Purecontent("c")
        self.urlinsert = DataInsert()
        self.urlinsert.urldbinit()

    def closeandreturn(self):
        """Close both stores and return the collected md5 -> url mapping."""
        self.purei.close()
        self.urlinsert.urldbclose()
        return self.md5urllist

    def contentadd(self, largeinsert):
        """Insert every document of ``largeinsert``.

        Args:
            largeinsert: Mapping of url -> sequence whose item 0 is the title
                and item 1 the content of the document.
        """
        for uni, cdata in largeinsert.items():
            self.uni = uni
            self.title = cdata[0]
            self.content = cdata[1]
            self.contentinsert()

    def contentinsert(self):
        """Register the current URL and store the current document.

        The URL (``self.uni``) is always written to the URL database and to
        ``self.md5urllist``.  The title and content are stored only when
        ``self.purei.checkexist()`` returns a truthy value; in that case the
        content is filtered, cut into lines and handed to ``self.purei``.
        """
        uni = self.uni
        # hashlib only hashes bytes.  Text is hashed as UTF-8, which gives the
        # same digest as before for every URL that used to work (ASCII text)
        # and no longer fails on non-ASCII text.
        raw = uni if isinstance(uni, bytes) else uni.encode("utf-8")
        md5url = hashlib.md5(raw).hexdigest()
        self.purei.url_md5 = md5url
        self.md5urllist[md5url] = uni

        # url db
        self.urlinsert.url = uni
        self.urlinsert.md5url = md5url
        self.urlinsert.inserturldb()

        stmk = stopmarks()
        # NOTE(review): the document is stored only when checkexist() is
        # truthy -- confirm what its return value means.
        if not self.purei.checkexist():
            return

        self.purei.title = self.title.encode("utf-8")

        # Keep characters with code >= 32 plus tab, LF and CR.  Collected in a
        # list and joined once instead of growing a string per character.
        kept = []
        n = 0
        for xw in self.content:
            code = ord(xw)
            if code >= 32 or code in (9, 10, 13):
                kept.append(xw)
            n += 1
            if n > 40000000:  # may over 65535 line of a document.
                break
        context = "".join(kept) + chr(32)

        contline = [""]
        i = 0  # line of contline list
        x = 0  # word number
        msl = 260  # length a line must exceed before it may be cut
        while x < len(context):
            ordx = ord(context[x])
            contline[i] = contline[i] + context[x]
            sentencecount = len(clearspace((contline[i])))
            # Cut the line once it is long enough and the current character
            # is accepted by one of the stop-mark tests, is the "." of ". ",
            # or is a line feed followed by a character with code below 65.
            if (
                sentencecount > msl
                and stmk.atypestopmarks(ordx)
                or sentencecount > msl
                and context[x : x + 2] == ". "
                or sentencecount > msl + 20
                and stmk.btypestopmarks(ordx)
                or sentencecount > msl + 20
                and ordx == 10
                and ord(context[x + 1 : x + 2]) < 65
            ):
                nextword = context[x + 1 : x + 2]
                if nextword:
                    if punctuationmarks(ord(nextword)):
                        # at some case, chinese word will use two marks.
                        x += 1
                        contline[i] = contline[i] + context[x]
                contline.append("")
                i = len(contline) - 1
                if msl <= 16640 and i % 2:
                    # Double it, until this value is bigger than 16640.
                    msl = msl + msl
            x += 1
            # While the measured length is still below msl, copy the next msl
            # characters in one go instead of one character per iteration.
            if sentencecount < msl:
                contline[i] = contline[i] + context[x : x + msl]
                x = x + msl

        contcleanline = []
        for line in contline:
            cont = clearspace(line)
            # Drop at most one space from each end.
            if len(cont) > 1:
                if cont[0] == chr(32) and cont[-1] == chr(32):
                    cont = cont[1:-1]
                elif cont[-1] == chr(32):
                    cont = cont[:-1]
                elif cont[0] == chr(32):
                    cont = cont[1:]
            # Skip lines of 65025 characters or more and lines that are a
            # single space.
            if len(cont) < 65025 and cont != chr(32):
                contcleanline.append(cont.encode("utf-8"))

        self.purei.purecotentinline = contcleanline
        self.purei.content = clearspace(context).encode("utf-8")
        self.purei.insertPurecontent()
        stderr.write(".")