def udngram(self):
    """Split ``self.uni`` with a "query" splitter and store the outcome.

    One ``bngram.wordspliting`` worker is started and joined (so it has
    finished before its result is used), then passed as a single-element
    list to ``self.dq.tailobject``. The pair that call returns is stored
    in ``self.worddic`` and ``self.wordlist``.
    """
    splitter = bngram.wordspliting(self.uni, "query")
    splitter.start()
    splitter.join()
    self.worddic, self.wordlist = self.dq.tailobject([splitter])
def bn_query(q, strin):
    """Split *strin* and publish the splitter's word collections on *q*.

    A 4-tuple ``(dicword, companword, ascword, gramword)`` read from the
    splitter is handed to ``q.put``.
    """
    splitter = bngram.wordspliting(strin, 'sd')
    # run() is invoked directly here rather than start().
    splitter.run()
    result = (
        splitter.dicword,
        splitter.companword,
        splitter.ascword,
        splitter.gramword,
    )
    q.put(result)
def _start_splitters(purei, md5url):
    """Start one word splitter per content line of *md5url*; return them in order."""
    pureserial = purei.queryserial(md5url)
    # Query the count once; a falsy count means there is nothing to split,
    # otherwise line indices 0..count (inclusive) are processed.
    rawcount = purei.querycontentcount(pureserial)
    purecount = int(rawcount) + 1 if rawcount else 0

    splitters = []
    for seri in xrange(purecount):
        querykey = pureserial + contentprocess.lintoascii(seri)
        # Throttle: do not launch another splitter while config.splitercpu
        # (or more) of the ones started so far are still active.
        while count_active(splitters) >= config.splitercpu:
            time.sleep(0.5)
        splitter = bngram.wordspliting(purei.querycontentinline(querykey), querykey)
        splitters.append(splitter)
        splitter.start()
    return splitters


def _store_outdic_words(splitters):
    """Join *splitters* and store each non-empty ``companword``; return True if any was stored."""
    dba = DataInsert()
    dba.outdicdbinit()  # open the database of words that are not in the dictionary
    try:
        dba.companwordcount = 0
        stored = False
        for splitter in splitters:
            # NOTE(review): the join is bounded by config.splitertimeout, so the
            # splitter's result is read even if it has not finished -- confirm
            # that this is the intended behaviour.
            splitter.join(config.splitertimeout)
            dba.wordlist = splitter.companword
            if dba.wordlist:
                dba.anuworddb()
                stored = True
    finally:
        # Close the out-of-dictionary database even when storing fails.
        dba.outdicdbclose()
    return stored


def _index_splitter_words(wordi, splitters):
    """Feed each splitter's ``dicword`` and ``companword`` to ``wordi.tempwurl``."""
    for splitter in splitters:
        # getdicdb is set to 1 for dicword and 2 for companword before the
        # words are handed over; its meaning is defined by TextInsert.
        if splitter.dicword:
            wordi.getdicdb = 1
            wordi.dicword = splitter.dicword
            wordi.tempwurl(splitter.querykey)
        if splitter.companword:
            wordi.getdicdb = 2
            wordi.dicword = splitter.companword
            wordi.tempwurl(splitter.querykey)


def linesplitinster(md5urllist):
    """Split the stored content of every key in *md5urllist* into words and index them.

    For each key the content lines are split by ``bngram.wordspliting``
    workers, out-of-dictionary words are stored through ``DataInsert``
    (reloading the anu dictionary on ``TextInsert`` when anything was
    stored), and the resulting words are passed to ``TextInsert.tempwurl``.
    Progress is reported on ``stderr``: "." per key, "dbsync" on every page
    sync and "+" whenever ``reloadxmlrpcd()`` returns a true value.

    Args:
        md5urllist: mapping whose keys identify the documents to process
            (only ``.keys()`` is used).

    The ``Purecontent`` and ``TextInsert`` handles are closed even when
    processing raises; the exception itself still propagates.
    """
    purei = Purecontent("r")
    try:
        wordi = TextInsert()
        try:
            wsynccount = 0
            for md5url in md5urllist.keys():
                splitters = _start_splitters(purei, md5url)
                if _store_outdic_words(splitters):
                    # New out-of-dictionary words were stored: reload the anu
                    # dictionary before indexing.
                    wordi.anureload()
                _index_splitter_words(wordi, splitters)

                wsynccount += 1
                if wsynccount > 8192:
                    # Periodic sync so that pending pages are written out.
                    stderr.write("dbsync")
                    wordi.sync_wpage()
                    wsynccount = 0
                    if reloadxmlrpcd():
                        stderr.write("+")
                stderr.write(".")

            # Final sync for whatever is left since the last periodic one.
            stderr.write("dbsync")
            wordi.sync_wpage()
            if reloadxmlrpcd():
                stderr.write("+")
        finally:
            wordi.closedicdb()
    finally:
        purei.close()