Beispiel #1
0
# status records the crawler's current state. A previously saved state is
# restored from statusPath when that file exists.
status = Status()
statusPath = 'sci.status'
if isfile(statusPath):
    # Open the file in a `with` block so the handle is closed even when
    # unpickling raises.
    # NOTE(review): pickle files are normally opened in binary mode ('rb');
    # the mode is left as 'r' here because the code that writes this file is
    # not visible -- confirm and switch both sides together.
    # NOTE(review): unpickling runs arbitrary code from the file; only load
    # status files produced by this crawler.
    with open(statusPath, 'r') as statusFile:
        status = pkl.load(statusFile)
    logging.info('status loaded')
    logging.warning('current status: ' + status.__str__())

# Whether to crawl the query list in reverse order (last keyword first).
reverse = True
if reverse:
    # An index of 0 is replaced by the last index, so the reverse walk then
    # starts at the end of querywords.
    if status.query_index == 0:
        status.query_index = len(querywords) - 1
    # Count down from the current index to 0 inclusive.
    index_range = range(status.query_index, -1, -1)
else:
    # Count up from the current index to the last keyword.
    index_range = range(status.query_index, len(querywords))
# Begin crawling.
# NOTE(review): the handler(s) belonging to this `try` lie beyond the end of
# this excerpt.
try:
    # Walk the queries in the order given by index_range.
    for i in index_range:
        # Re-initialise the state for this query, then record which query is
        # now in progress.
        status.reset()
        query = querywords[i]
        status.query_index = i ; status.query = query
        # count tracks how many papers have been crawled for this query.
        # The original note states a cap of 100 papers per query; the code
        # enforcing that cap is not part of this excerpt.
        count = 0
        # NOTE(review): the message has no separator between the index value
        # and the text 'keyword = '.
        logging.info('current query:'+'index = '+ str(i) + 'keyword = '+query)
        # Load the Web of Knowledge start page, then hand the driver to the
        # project helpers -- judging by their names they switch the UI to
        # English and submit the search form for this keyword (TODO confirm).
        driver.get('http://apps.webofknowledge.com/')
        chooseEnglishLanguage(driver)
        submitForm(driver , query)
Beispiel #2
0
logging.info('bloom filter loaded')
# Paper information is stored in the paperInfo object.
paperInfo = PaperInfo()
# status records the crawler's current state. A previously saved state is
# restored from statusPath when that file exists.
status = Status()
statusPath = 'sci.status'
if isfile(statusPath):
    # Open the file in a `with` block so the handle is closed even when
    # unpickling raises.
    # NOTE(review): pickle files are normally opened in binary mode ('rb');
    # the mode is left as 'r' here because the code that writes this file is
    # not visible -- confirm and switch both sides together.
    # NOTE(review): unpickling runs arbitrary code from the file; only load
    # status files produced by this crawler.
    with open(statusPath, 'r') as statusFile:
        status = pkl.load(statusFile)
    logging.info('status loaded')
# NOTE(review): the handler(s) belonging to this outer `try` lie beyond the
# end of this excerpt.
try:
    # Start from the current status.query_index and go to the last keyword.
    for i in range(status.query_index , len(querywords)):
        # Re-initialise the state for this query, then record which query is
        # now in progress.
        status.reset()
        query = querywords[i]
        status.query_index = i ; status.query = query
        # count tracks how many papers have been crawled for this query.
        # The original note states a cap of 100 papers per query; the code
        # enforcing that cap is not part of this excerpt.
        count = 0
        # NOTE(review): the message has no separator between the index value
        # and the text 'keyword = '.
        logging.info('current query:'+'index = '+ str(i) + 'keyword = '+query)
        # Load the Web of Knowledge start page, then hand the driver to the
        # project helpers -- judging by their names they switch the UI to
        # English and submit the search form for this keyword (TODO confirm).
        driver.get('http://apps.webofknowledge.com/')
        chooseEnglishLanguage(driver)
        submitForm(driver , query)
        # The original note here says "click the first link to open the
        # detail page"; that step is not in this excerpt. The code below
        # reads the hit count from the results page.
        # 0 is only a placeholder: it is either overwritten below or the
        # iteration is abandoned via `continue`.
        resultCount = 0
        try:
            # Text of the element with id 'hitCount.top', stripped and with
            # commas removed; the value stays a string.
            resultCount = driver.find_element_by_id('hitCount.top').text.strip().replace(',', '')
        # NOTE(review): `except X , e` is Python 2-only syntax.
        except NoSuchElementException , e:
            # No hit-count element was found: log it and move on to the next
            # keyword.
            logging.warning('search found no records exception. query = ' + query)
            continue
        status.resultCount = resultCount
        logging.info('resultCount: '+ resultCount)