Code Example #1
File: sci_clawer.py  Project: irgb/SCICrawler
# Imports inferred from usage (not shown in the original excerpt);
# PaperInfo, Status, and querywords come from elsewhere in the project.
import logging
import pickle as pkl
from os.path import isfile

from selenium import webdriver
from pybloomfilter import BloomFilter  # assumes the pybloomfiltermmap package

formatter = logging.Formatter('%(asctime)s, %(filename)s:%(lineno)d, %(levelname)s: %(message)s')
console = logging.StreamHandler()
console.setLevel(logging.WARNING)
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
###--------------main-----------------###

driver = webdriver.Chrome()
# driver = webdriver.Remote("http://localhost:4444/wd/hub", webdriver.DesiredCapabilities.CHROME.copy())
filterPath = 'sci.bloom_filter'
bf = BloomFilter.open(filterPath) if isfile(filterPath) else BloomFilter(1000000, 0.001, filterPath)
logging.info('bloom filter loaded')
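# (sketch, not in the original excerpt) the filter is typically consulted to
# skip papers that were already crawled, e.g.:
#   if paper_id not in bf:
#       bf.add(paper_id)  # pybloomfilter supports `in` and .add()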
# store paper information in a PaperInfo object
paperInfo = PaperInfo()

# status records the current crawl position so runs can resume
status = Status()
statusPath = 'sci.status'
if isfile(statusPath):
    with open(statusPath, 'rb') as f:  # pickle files must be read in binary mode
        status = pkl.load(f)
    logging.info('status loaded')
    logging.warning('current status: %s', status)

# whether to crawl the query list in reverse order
reverse = True
if not reverse:
    index_range = range(status.query_index, len(querywords))
else:
    if status.query_index == 0:
        status.query_index = len(querywords) - 1
    index_range = range(status.query_index, -1, -1)
# begin crawler
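Neither excerpt shows the Status class itself. As a minimal sketch, assuming only the fields and methods the code above actually touches (query_index, query, reset(), and a readable string form), it might look like the following; the save_status helper is likewise hypothetical, added only to show the binary-mode counterpart of the pkl.load call above:

import pickle as pkl

class Status(object):
    # resumable crawl position, reconstructed from usage in the excerpts
    def __init__(self):
        self.query_index = 0  # index into querywords
        self.query = ''       # current query keyword

    def reset(self):
        # clear per-query state before starting the next query
        self.query = ''

    def __str__(self):
        return 'query_index=%d, query=%s' % (self.query_index, self.query)

def save_status(status, path='sci.status'):
    # hypothetical helper: persist status in the same binary mode used on load
    with open(path, 'wb') as f:
        pkl.dump(status, f)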
Code Example #2
File: sci_clawer.py  Project: waleking/SCICrawler
# Imports inferred from usage (not shown in the original excerpt);
# PaperInfo, Status, and querywords come from elsewhere in the project.
import logging
import pickle as pkl
from os.path import isfile

from selenium import webdriver
from pybloomfilter import BloomFilter  # assumes the pybloomfiltermmap package

formatter = logging.Formatter('%(asctime)s, %(filename)s:%(lineno)d, %(levelname)s: %(message)s')
console = logging.StreamHandler()
console.setLevel(logging.WARNING)
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
###--------------main-----------------###

driver = webdriver.Chrome()

filterPath = 'sci.bloom_filter'
bf = BloomFilter.open(filterPath) if isfile(filterPath) else BloomFilter(1000000, 0.001, filterPath)
logging.info('bloom filter loaded')
# store paper information in a PaperInfo object
paperInfo = PaperInfo()
# status records the current crawl position so runs can resume
status = Status()
statusPath = 'sci.status'
if isfile(statusPath):
    with open(statusPath, 'rb') as f:  # pickle files must be read in binary mode
        status = pkl.load(f)
    logging.info('status loaded')
try:
    # resume from the saved query_index
    for i in range(status.query_index, len(querywords)):
        # reset per-query state
        status.reset()
        query = querywords[i]
        status.query_index = i
        status.query = query
        # count tracks the papers crawled for this query; at most 100 per query
        count = 0
        logging.info('current query: index = %d, keyword = %s', i, query)
        driver.get('http://apps.webofknowledge.com/')
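The excerpt is cut off inside the try block. Based only on the comments above (Bloom-filter deduplication and the 100-paper cap per query), one plausible continuation of a loop iteration follows; result_ids and crawl_paper are hypothetical names, not part of the project:

        for paper_id in result_ids:       # hypothetical: IDs scraped from the results page
            if count >= 100:
                break                     # per the comment: at most 100 papers per query
            if paper_id in bf:
                continue                  # already crawled, skip
            bf.add(paper_id)
            crawl_paper(driver, paper_id) # hypothetical helper
            count += 1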