Beispiel #1
0
def getDuplicationRate():
    count=0
    for tablename in dbconfig.tableName.itervalues():
        count+=table.getAllCount(tablename)
    mcount=tablemerge.getAllCount(dbconfig.mergetable)
    m2count=tablemerge2.getAllCount(dbconfig.mergetable2)    
    print 'First Duplication Rate: %.4f (%d/%d)'%(float(count-mcount)/count,count-mcount,count)    
    print 'Second Duplication Rate: %.4f (%d/%d)'%(float(mcount-m2count)/mcount,mcount-m2count,mcount)
Beispiel #2
0
def main():    
    infoList=[] 
    oldtime=time.time()
    for page in range(1,3):
        infoList+=getPageInfo(page)
            
    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
#             print info['loadtime'],info['title']
        except:
            logging.error('encoding not supported')
    print 'sohu has crawled %s records,time cost: %s (seconds)' % (len(infoList), time.time()-oldtime) 
    print 'database has',table.getAllCount(ctable)
Beispiel #3
0
def main():
    infoList=[] 
    oldtime=time.time()
#     page can be started from 0 to 5 which represents different category
    for page in range(0,6):
        infoList+=getPageInfo(page)
            
    for info in infoList:
        try:
#             table.InsertItemDict(ctable, info)
            print info['loadtime'],info['title']
        except:
            logging.error('encoding not supported')
    print 'qq has crawled %s records,time cost: %s (seconds)' % (len(infoList), time.time()-oldtime) 
    print 'database has',table.getAllCount(ctable)    
Beispiel #4
0
def main():
    infoList = []
    oldtime = time.time()
    for page in range(1, 2):
        for cat in categoryDict.iterkeys():
            infoList += getCatPageInfo(cat, page)

    for info in infoList:
        try:
            #             table.InsertItemDict(ctable, info)
            print info["loadtime"], info["title"]
        except:
            logging.error("encoding not supported")

    print "kankan has crawled %s records,time cost: %s (seconds)" % (len(infoList), time.time() - oldtime)
    print "database has", table.getAllCount(ctable)
Beispiel #5
0
def main():
    infoList=[]
    oldtime=time.time()
    infoList+=getFirstPageInfo() 
    infoList+=getHiddenPageInfo()
    infoList+=getExtraPageInfo(40)     
    for info in infoList:
        try:
#             table.InsertItemDict(ctable, info)
            print info['loadtime'],info['title']
        except:
            logging.error('encoding not supported')
    msg='sina has crawled %s records,time cost: %s (seconds)' % (len(infoList), time.time()-oldtime) 
    print msg
    log.info(msg)
    print 'database has',table.getAllCount(ctable)
Beispiel #6
0
def main():
    infoList=[]      
#     infoList+=getFirstPageInfo()
#     infoList+=getPageRightInfo() 
#  The two functions above can not get exact loadtime
#     infoList+=getExtraPageInfo(21)   # 21 is normal
    oldtime=time.time()
    infoList+=getExtraPageInfo(105)  
    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
#             print info['loadtime'],info['title']
        except:
            logging.error('encoding not supported')
    msg='v1 has crawled %s records,time cost: %s (seconds)' % (len(infoList), time.time()-oldtime)
    print msg
    log.info(msg) 
    print 'database has',table.getAllCount(ctable)