# Attach a console handler to the root logger so WARNING-and-above records
# are echoed to the terminal in addition to any file handlers configured
# earlier in the script.
# NOTE(review): assumes `formatter` was created earlier in the file — confirm.
console = logging.StreamHandler()
console.setLevel(logging.WARNING)
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

###--------------main-----------------###
# Local Chrome session; switch to the commented Remote() call to drive a
# Selenium grid instead.
driver = webdriver.Chrome()
# driver = webdriver.Remote("http:localhost:4444/wd/hub", webdriver.DesiredCapabilities.CHROME.copy())

# Bloom filter de-duplicates already-crawled papers across runs: reuse the
# on-disk filter when present, otherwise create a fresh one with capacity
# 1,000,000 and a 0.1% false-positive rate.
filterPath = 'sci.bloom_filter'
bf = BloomFilter.open(filterPath) if isfile(filterPath) else BloomFilter(1000000, 0.001, filterPath)
logging.info('bloom filter loaded')

# Paper metadata is accumulated in this PaperInfo object.
paperInfo = PaperInfo()

# Status records the crawler's current position so an interrupted run can
# resume where it left off.
status = Status()
statusPath = 'sci.status'
if isfile(statusPath):
    # FIX: pickle data is binary — open with 'rb' (text mode raises on
    # Python 3 and can corrupt reads on Windows). 'with' guarantees the
    # handle is closed instead of being leaked.
    with open(statusPath, 'rb') as statusFile:
        status = pkl.load(statusFile)
    logging.info('status loaded')
logging.warning('current status: %s', status)

# When reverse is True, walk the query list back-to-front.
reverse = True
index_range = []
if not reverse:
    index_range = range(status.query_index, len(querywords))
else:
    # NOTE(review): a saved query_index of 0 is treated here as "restart
    # from the last query" in reverse mode — confirm 0 never means
    # "resume at the first query".
    if status.query_index == 0:
        status.query_index = len(querywords) - 1
    index_range = range(status.query_index, -1, -1)
# begin crawler
formatter = logging.Formatter('%(asctime)s, %(filename)s:%(lineno)d, %(levelname)s: %(message)s') console = logging.StreamHandler() console.setLevel(logging.WARNING) console.setFormatter(formatter) logging.getLogger('').addHandler(console) ###--------------main-----------------### driver = webdriver.Chrome() filterPath = 'sci.bloom_filter' bf = BloomFilter.open(filterPath) if isfile(filterPath) else BloomFilter(1000000, 0.001, filterPath) logging.info('bloom filter loaded') #将paper信息保存在paperInfo对象中 paperInfo = PaperInfo() #status用于记录当前状态 status = Status() statusPath = 'sci.status' if isfile(statusPath) : status = pkl.load(open(statusPath,'r')) logging.info('status loaded') try: #从当前query_index位置开始 for i in range(status.query_index , len(querywords)): #初始化这次query的状态 status.reset() query = querywords[i] status.query_index = i ; status.query = query #count 用于记录每个query爬取的论文数量,每个query最多爬取100篇 count = 0 logging.info('current query:'+'index = '+ str(i) + 'keyword = '+query) driver.get('http://apps.webofknowledge.com/')