# Script preamble: extend the import path, load the project modules, then fill
# a module-level Depository with the records returned for the merge table.
# NOTE(review): line breaks and indentation were reconstructed from a
# whitespace-collapsed source -- confirm against the original layout.
import sys
# These path entries are appended before the project imports below execute.
sys.path.append('..')
sys.path.append('../common')
from common.logger import log
import getdoc
from database import table,tablemerge,dbconfig
from aggregate.rmduplicate import Depository
import time
import logging
from common import toolpit
from config import merge1_rmd_file

# Start of the timed section; the elapsed time is reported through `msg` below.
oldtime=time.time()
# Debug alternative (disabled): depos=Depository(0.8,merge1_rmd_file)
# NOTE(review): the meaning of 0.8 is defined by Depository, which is not
# visible here -- presumably a similarity threshold; confirm.
depos=Depository(0.8)
# NOTE(review): judging by the helper's name, 20 is presumably a look-back
# window in days -- confirm against getdoc.get_records_dayago.
docs=getdoc.get_records_dayago(dbconfig.mergetable,20)
# `docs` is iterated as a mapping of doc -> summary (Python 2 `iteritems`).
for doc,summary in docs.iteritems():
    depos.add_doc(doc, summary)  # just add doc into the repository
msg='Depository summary builds,time cost: %.2f (s)' % (time.time()-oldtime,)
# The same message is written to stdout and to the root logger.
print msg
logging.info(msg)

def __addDoctoTable(doc):
    # NOTE(review): this function continues past the visible part of the
    # file; only its opening statements are shown and documented here.
    # Looks up the stored rows for this document by (source, uid).
    rows=table.getRecordsById(doc.source, doc.uid)
    # The lookup result is compared with -1, which this code treats as failure.
    if rows==-1:
        print '%s getRecordsById error'%(doc.source)
        return
    # Only proceeds when the first returned row is non-empty.
    if len(rows[0])>0:
        # Copy the first row and append two extra fields: an empty string
        # (`mtype`) and a zero (`click`).
        exrecord=list(rows[0])
        mtype,click='',0
        exrecord+=[mtype,click]
# remove from the inverse index if doc in self.forindex: _summary=self.forindex[doc] for word in _summary: if word in self.invindex and doc in self.invindex[word]: self.invindex[word].remove(doc) # remove from the forward index self.forindex.pop(doc) def remove_doc_before(self,ctime): rms=[] for doc in self.forindex: if doc.ctime<ctime: rms.append(doc) for doc in rms: self.__remove_doc(doc) msg='remove:%s,left:%s'%(len(rms),len(self.forindex)) print msg logging.info(msg) if __name__=='__main__': depos=Depository(0.8) for tablename in dbconfig.tableName.itervalues(): docs=getdoc.get_records_dayago(tablename,30) for doc,summary in docs.iteritems(): isnew,exist_doc=depos.add_doc(doc, summary) if isnew: pass # print exist_doc.uid,exist_doc.source,'-->',doc.uid,doc.source print 'time costs:%.2f (s)'%(time.time()-oldtime,)