def getDuplicationRate():
    """Print how many rows each of the two merge passes removed.

    The row counts of every table named in ``dbconfig.tableName`` are summed
    and compared with the row count of ``dbconfig.mergetable``; that count is
    in turn compared with the row count of ``dbconfig.mergetable2``.  Each
    result is printed as ``rate (removed/total)``.

    A total of zero is reported as a rate of 0.0 instead of raising
    ZeroDivisionError.  Returns None.
    """

    def ratio(removed, total):
        # Empty tables give a zero denominator; report "nothing removed".
        if total == 0:
            return 0.0
        return float(removed) / total

    count = sum(
        table.getAllCount(tablename)
        for tablename in dbconfig.tableName.itervalues()
    )
    mcount = tablemerge.getAllCount(dbconfig.mergetable)
    m2count = tablemerge2.getAllCount(dbconfig.mergetable2)

    first_removed = count - mcount
    second_removed = mcount - m2count

    # Single parenthesised expressions print identically on Python 2 and 3.
    print('First Duplication Rate: %.4f (%d/%d)'
          % (ratio(first_removed, count), first_removed, count))
    print('Second Duplication Rate: %.4f (%d/%d)'
          % (ratio(second_removed, mcount), second_removed, mcount))
def main():
    """Collect records for pages 1 and 2, store them, and print a summary.

    Records returned by ``getPageInfo(page)`` are accumulated, then each one
    is inserted into ``ctable`` via ``table.InsertItemDict``.  A record whose
    insert fails is logged and skipped so the remaining records are still
    processed.  Finally the number of collected records, the elapsed time and
    the table's row count are printed.  Returns None.
    """
    infoList = []
    oldtime = time.time()
    for page in range(1, 3):
        infoList += getPageInfo(page)

    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
            # Debug alternative: print info['loadtime'], info['title']
        except Exception:
            # Best effort per record.  ``except Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt/SystemExit stop the run, and
            # the traceback shows the real cause, which is not necessarily an
            # encoding problem despite the message.
            logging.exception('encoding not supported')

    # Single parenthesised expressions print identically on Python 2 and 3.
    print('sohu has crawled %s records,time cost: %s (seconds)'
          % (len(infoList), time.time() - oldtime))
    print('database has %s' % (table.getAllCount(ctable),))
def main():
    """Collect records for pages 0 through 5, print them, and print a summary.

    Records returned by ``getPageInfo(page)`` are accumulated and each
    record's ``loadtime`` and ``title`` are printed.  A record that cannot be
    printed is logged and skipped.  Finally the number of collected records,
    the elapsed time and the row count of ``ctable`` are printed.
    Returns None.

    NOTE(review): the database insert is commented out here, so nothing is
    stored by this function -- confirm that is intentional.
    """
    infoList = []
    oldtime = time.time()
    # Page index runs from 0 to 5; each value represents a different category.
    for page in range(0, 6):
        infoList += getPageInfo(page)

    for info in infoList:
        try:
            # Storage disabled: table.InsertItemDict(ctable, info)
            print('%s %s' % (info['loadtime'], info['title']))
        except Exception:
            # Best effort per record.  ``except Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt/SystemExit stop the run, and
            # the traceback shows the real cause (e.g. a missing key), which
            # is not necessarily an encoding problem despite the message.
            logging.exception('encoding not supported')

    # Single parenthesised expressions print identically on Python 2 and 3.
    print('qq has crawled %s records,time cost: %s (seconds)'
          % (len(infoList), time.time() - oldtime))
    print('database has %s' % (table.getAllCount(ctable),))
def main():
    """Collect page-1 records for every category, print them, and summarise.

    For each key of ``categoryDict`` the records returned by
    ``getCatPageInfo(cat, page)`` are accumulated (only page 1 is requested),
    and each record's ``loadtime`` and ``title`` are printed.  A record that
    cannot be printed is logged and skipped.  Finally the number of collected
    records, the elapsed time and the row count of ``ctable`` are printed.
    Returns None.

    NOTE(review): the database insert is commented out here, so nothing is
    stored by this function -- confirm that is intentional.
    """
    infoList = []
    oldtime = time.time()
    for page in range(1, 2):
        for cat in categoryDict.iterkeys():
            infoList += getCatPageInfo(cat, page)

    for info in infoList:
        try:
            # Storage disabled: table.InsertItemDict(ctable, info)
            print("%s %s" % (info["loadtime"], info["title"]))
        except Exception:
            # Best effort per record.  ``except Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt/SystemExit stop the run, and
            # the traceback shows the real cause (e.g. a missing key), which
            # is not necessarily an encoding problem despite the message.
            logging.exception("encoding not supported")

    # Single parenthesised expressions print identically on Python 2 and 3.
    print("kankan has crawled %s records,time cost: %s (seconds)"
          % (len(infoList), time.time() - oldtime))
    print("database has %s" % (table.getAllCount(ctable),))
def main():
    """Collect records from three sources, print them, and report a summary.

    Records from ``getFirstPageInfo()``, ``getHiddenPageInfo()`` and
    ``getExtraPageInfo(40)`` are accumulated in that order and each record's
    ``loadtime`` and ``title`` are printed.  A record that cannot be printed
    is logged and skipped.  The summary line (record count and elapsed time)
    is both printed and sent to ``log.info``; the row count of ``ctable`` is
    printed last.  Returns None.

    NOTE(review): the database insert is commented out here, so nothing is
    stored by this function -- confirm that is intentional.
    """
    oldtime = time.time()
    infoList = []
    infoList += getFirstPageInfo()
    infoList += getHiddenPageInfo()
    infoList += getExtraPageInfo(40)

    for info in infoList:
        try:
            # Storage disabled: table.InsertItemDict(ctable, info)
            print('%s %s' % (info['loadtime'], info['title']))
        except Exception:
            # Best effort per record.  ``except Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt/SystemExit stop the run, and
            # the traceback shows the real cause (e.g. a missing key), which
            # is not necessarily an encoding problem despite the message.
            logging.exception('encoding not supported')

    msg = ('sina has crawled %s records,time cost: %s (seconds)'
           % (len(infoList), time.time() - oldtime))
    # Single parenthesised expressions print identically on Python 2 and 3.
    print(msg)
    log.info(msg)
    print('database has %s' % (table.getAllCount(ctable),))
def main():
    """Collect records from ``getExtraPageInfo(105)``, store them, and summarise.

    Each collected record is inserted into ``ctable`` via
    ``table.InsertItemDict``.  A record whose insert fails is logged and
    skipped so the remaining records are still processed.  The summary line
    (record count and elapsed time) is both printed and sent to ``log.info``;
    the row count of ``ctable`` is printed last.  Returns None.
    """
    infoList = []
    # getFirstPageInfo() and getPageRightInfo() are deliberately not used:
    # they cannot provide an exact loadtime.
    # getExtraPageInfo(21) is the normal setting; 105 is used here.
    oldtime = time.time()
    infoList += getExtraPageInfo(105)

    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
            # Debug alternative: print info['loadtime'], info['title']
        except Exception:
            # Best effort per record.  ``except Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt/SystemExit stop the run, and
            # the traceback shows the real cause, which is not necessarily an
            # encoding problem despite the message.
            logging.exception('encoding not supported')

    msg = ('v1 has crawled %s records,time cost: %s (seconds)'
           % (len(infoList), time.time() - oldtime))
    # Single parenthesised expressions print identically on Python 2 and 3.
    print(msg)
    log.info(msg)
    print('database has %s' % (table.getAllCount(ctable),))