# NOTE(review): the line below is a whitespace-collapsed fragment of a crawler
# script. Because it starts with '#', Python treats the whole physical line as
# a comment as written. The statements it spells out do the following:
#   * create a Status and, if the file 'sci.status' exists, replace it with the
#     object unpickled from that file, then log it;
#   * build index_range over querywords: ascending from status.query_index when
#     reverse is False, otherwise descending from status.query_index to 0
#     (a stored index of 0 is first replaced by len(querywords)-1);
#   * inside a try whose handler is not visible in this fragment, loop over
#     index_range: reset status, record the index and query on it, zero count,
#     log the query, open http://apps.webofknowledge.com/, then call
#     chooseEnglishLanguage(driver) and submitForm(driver, query).
# NOTE(review): original indentation is lost, so it is unclear whether the
# descending range(...) assignment sits inside the `== 0` check - confirm.
# NOTE(review): the pickle file is opened with mode 'r' and never closed -
# confirm whether 'rb' and a with-block are intended.
# NOTE(review): in reverse mode a saved query_index of 0 cannot be told apart
# from "no progress yet", so such a run restarts from the last query - confirm.
#[status records the current state] status = Status() statusPath = 'sci.status' if isfile(statusPath) : status = pkl.load(open(statusPath,'r')) logging.info('status loaded') logging.warning('current status: ' + status.__str__()) #[whether to crawl in reverse order] reverse = True index_range = [] if not reverse : index_range = range(status.query_index , len(querywords)) else: if status.query_index == 0: status.query_index=len(querywords)-1 index_range = range(status.query_index , -1 ,-1) #begin crawler try: #[start from the current query_index] for i in index_range: #[initialise the status for this query] status.reset() query = querywords[i] status.query_index = i ; status.query = query #[count records how many papers were crawled for each query; at most 100 per query] count = 0 logging.info('current query:'+'index = '+ str(i) + 'keyword = '+query) driver.get('http://apps.webofknowledge.com/') chooseEnglishLanguage(driver) submitForm(driver , query)
# NOTE(review): the line below is a whitespace-collapsed fragment. As written,
# only the leading logging.info(...) call is code; everything after the first
# '#' is comment text. The collapsed statements spell out a forward-only
# variant of the crawl loop:
#   * create a PaperInfo and a Status, replacing the Status with the object
#     unpickled from 'sci.status' if that file exists;
#   * inside a try whose handler is not visible in this fragment, loop over
#     range(status.query_index, len(querywords)): reset status, record the
#     index and query on it, zero count, log the query, open
#     http://apps.webofknowledge.com/, then call chooseEnglishLanguage(driver)
#     and submitForm(driver, query);
#   * read the text of the element with id 'hitCount.top' (stripped, commas
#     removed) into resultCount; on NoSuchElementException log a warning and
#     continue with the next query; otherwise store resultCount on status and
#     log it.
# NOTE(review): `except NoSuchElementException , e:` is Python 2-only syntax.
# NOTE(review): resultCount starts as the int 0 but is a string once read
# from the page (it is concatenated into the log message) - confirm the type
# that users of status.resultCount expect.
# NOTE(review): the pickle file is opened with mode 'r' and never closed -
# confirm whether 'rb' and a with-block are intended.
logging.info('bloom filter loaded') #[paper information is stored in the paperInfo object] paperInfo = PaperInfo() #[status records the current state] status = Status() statusPath = 'sci.status' if isfile(statusPath) : status = pkl.load(open(statusPath,'r')) logging.info('status loaded') try: #[start from the current query_index] for i in range(status.query_index , len(querywords)): #[initialise the status for this query] status.reset() query = querywords[i] status.query_index = i ; status.query = query #[count records how many papers were crawled for each query; at most 100 per query] count = 0 logging.info('current query:'+'index = '+ str(i) + 'keyword = '+query) driver.get('http://apps.webofknowledge.com/') chooseEnglishLanguage(driver) submitForm(driver , query) #[click the first link to jump to the detail page] resultCount = 0 try: resultCount = driver.find_element_by_id('hitCount.top').text.strip().replace(',', '') except NoSuchElementException , e: logging.warning('search found no records exception. query = ' + query) continue status.resultCount = resultCount logging.info('resultCount: '+ resultCount)