def insertPurecontent(self):
    """Store the current page under a newly allocated serial number.

    Nothing is written when ``self.pdb`` already has ``self.url_md5`` as a
    key, or when ``self.content`` is at most one character long.

    Otherwise the next serial is derived from the last record of
    ``self.serialcursor`` and the following records are written:

    * ``serialdb[serial]``          -> ``url_md5``
    * ``puresedb[url_md5]``         -> ``serial``
    * ``pdb[serial]``               -> zlib-compressed ``content`` (level 9)
    * ``tdb[serial]``               -> ``title`` (only if absent and the
      title is longer than one character)
    * ``purecontentcount[serial]``  -> number of lines, as a decimal string
    * ``pureinline[serial + lintoascii(0)]``     -> ``title``
    * ``pureinline[serial + lintoascii(i + 1)]`` -> line ``i`` of
      ``self.purecotentinline``
    """
    # NOTE(review): pdb is written below keyed by the serial, not by
    # url_md5, so this lookup may never match; puresedb is the table keyed
    # by url_md5. Left unchanged -- confirm the intended dedupe check.
    if self.pdb.has_key(self.url_md5) or len(self.content) <= 1:
        return

    # Seed record written on every insert; presumably it guarantees that
    # serialcursor.last() always has a record to return -- TODO confirm.
    self.serialdb[chr(0) * 4] = "0"
    serialnumber = contentline.asciitoint(self.serialcursor.last()[0]) + 1
    asciiserial = contentline.inttoascii(serialnumber)

    # Two-way mapping between the serial and the URL digest.
    self.serialdb[asciiserial] = self.url_md5
    self.puresedb["%s" % self.url_md5] = asciiserial

    self.pdb[asciiserial] = zlib.compress(self.content, 9)
    if not self.tdb.has_key(asciiserial) and len(self.title) > 1:
        self.tdb[asciiserial] = "%s" % self.title

    # Per-line records: the line count, then the title in slot 0 and the
    # content lines in slots 1..n. The original comment describes the
    # lintoascii suffix as "2 bytes serial line".
    lines = self.purecotentinline
    self.purecontentcount[asciiserial] = str(len(lines))
    self.pureinline[asciiserial + contentline.lintoascii(0)] = self.title
    for index, line in enumerate(lines):
        self.pureinline[asciiserial + contentline.lintoascii(index + 1)] = line
def _start_line_splitters(purei, pureserial):
    """Start one word-splitting worker per stored line of a document.

    Returns the started workers in line order.
    """
    stored = purei.querycontentcount(pureserial)
    if stored:
        # One key more than the stored count is read; presumably slot 0 is
        # the title line -- confirm against the code writing these records.
        linecount = int(stored) + 1
    else:
        linecount = 0

    workers = []
    for seri in xrange(linecount):
        querykey = pureserial + contentprocess.lintoascii(seri)
        # Hold back until fewer than config.splitercpu workers are active.
        while count_active(workers) >= config.splitercpu:
            time.sleep(0.5)
        worker = bngram.wordspliting(purei.querycontentinline(querykey), querykey)
        workers.append(worker)
        worker.start()  # execute worker.run()
    return workers


def _store_unknown_words(workers):
    """Wait for the workers and save each non-empty ``companword`` list.

    Returns True when at least one list was saved, which tells the caller
    that the dictionary has to be reloaded.
    """
    dba = DataInsert()
    dba.outdicdbinit()  # open the database of words that are out of dic
    try:
        dba.companwordcount = 0
        stored_any = False
        for worker in workers:
            # The wait is bounded by config.splitertimeout.
            # NOTE(review): a worker that has not finished by then is read
            # as-is -- confirm that is acceptable.
            worker.join(config.splitertimeout)
            dba.wordlist = worker.companword
            if dba.wordlist:
                dba.anuworddb()
                stored_any = True
    finally:
        # Close the database even when a worker result cannot be stored.
        dba.outdicdbclose()
    return stored_any


def _index_worker_words(wordi, workers):
    """Feed every worker's word lists to ``wordi.tempwurl``.

    ``getdicdb`` is set to 1 for ``dicword`` lists and to 2 for
    ``companword`` lists before each call.
    """
    for worker in workers:
        if worker.dicword:
            wordi.getdicdb = 1
            wordi.dicword = worker.dicword
            wordi.tempwurl(worker.querykey)
        if worker.companword:
            wordi.getdicdb = 2
            wordi.dicword = worker.companword
            wordi.tempwurl(worker.querykey)


def _sync_word_pages(wordi):
    """Sync the word pages and report an xmlrpcd reload on stderr."""
    stderr.write("dbsync")
    wordi.sync_wpage()
    if reloadxmlrpcd():
        stderr.write("+")


def linesplitinster(md5urllist):
    """Split every stored document named in *md5urllist* into words and index them.

    For each key of *md5urllist* the document serial is looked up, one
    splitter worker is run per stored line, non-empty ``companword`` lists
    are saved (followed by ``wordi.anureload()`` when any were saved), and
    the resulting word lists are handed to ``TextInsert.tempwurl``.

    Progress goes to stderr: ``.`` per document, ``dbsync`` on every sync
    and ``+`` when ``reloadxmlrpcd()`` returns a true value.  The word
    pages are synced after every 8193rd document and once more at the end.
    Both database handles are closed even if processing fails.

    NOTE(review): the line structure of this function was reconstructed;
    the reload check is assumed to belong to the periodic sync branch and
    the progress dot to the per-document loop -- confirm.
    """
    purei = Purecontent("r")
    try:
        wordi = TextInsert()
        try:
            wsynccount = 0
            for md5url in md5urllist.keys():
                workers = _start_line_splitters(purei, purei.queryserial(md5url))
                if _store_unknown_words(workers):
                    wordi.anureload()
                _index_worker_words(wordi, workers)

                wsynccount += 1
                if wsynccount > 8192:
                    _sync_word_pages(wordi)
                    wsynccount = 0
                stderr.write(".")
            _sync_word_pages(wordi)
        finally:
            wordi.closedicdb()
    finally:
        purei.close()
def x3tracp(self, x3t):
    """Encode the first two elements of *x3t* as one string.

    The result is ``contentline.lintoascii(x3t[0])`` followed by a single
    character whose ordinal is ``x3t[1] - x3t[0]``.

    ``chr`` raises ``ValueError`` when that difference is outside the range
    it accepts (0..255 on Python 2).
    """
    head = contentline.lintoascii(x3t[0])
    # The second element is stored only as its distance from the first.
    return head + chr(x3t[1] - x3t[0])