def _collect_seed_urls(keyword_file, years):
    """Build the seed-URL list for every keyword in *keyword_file*.

    For each keyword, seeds are generated for the full *years* range and
    additionally for the first four months of 2015 (the crawl's cutoff).
    Returns a flat list of URLs.
    """
    urls = []
    for keyword in loadkeywords(keyword_file):
        urls.extend(generateseeds(keyword, years))
        urls.extend(generateseeds(keyword, [2015], [1, 2, 3, 4]))
    return urls


def main(option, *args):
    """Command-line entry point for the WSJ crawler.

    Supported options:
      start   -- seed both spiders from the keyword files and start crawling
      report  -- print the driver's progress report
      extract -- extract records from a log file (args: logfile outfile)
      recover -- reload a spider's URL queue (args: spidername urlfile)
    Unknown options print a message and return.
    """
    if option not in ('start', 'report', 'extract', 'recover'):
        print('Option not supported.')
        return
    if option == 'extract':
        # extract works purely on files and does not need a Driver.
        if len(args) != 2:
            print('Usage: python WSJCrawler.py extract logfile outfile')
        else:
            extractlog(args[0], args[1])
        return
    driver = Driver(SETTINGS)
    if option == 'start':
        # Whole years 2005-2014; 2015 is handled month-by-month inside
        # _collect_seed_urls.
        years = list(range(2005, 2015))
        driver.addtask('IncSpider',
                       _collect_seed_urls('./data/inc.txt', years))
        driver.addtask('WordSpider',
                       _collect_seed_urls('./data/word.txt', years))
        driver.start()
    elif option == 'report':
        driver.report()
    elif option == 'recover':
        if len(args) != 2:
            print('Usage: python WSJCrawler.py recover spidername urlfile')
        else:
            driver.recover(args[0], args[1])
def test_start(self):
    """Run the driver through a full lifecycle: clean both spiders'
    frontiers, queue tasks, then start, pause, resume, and stop,
    cleaning the frontiers again afterwards."""
    driver = Driver(DRIVER)

    def wipe_frontiers():
        # Remove any leftover queue state so the run begins and ends clean.
        for name in ('Spider1', 'Spider2'):
            driver.getspider(name).frontier.clean('todo', 'visited')

    wipe_frontiers()
    urls = ['http://www.baidu.com', 'http://www.zhihu.com', 'http://www.renren.com']
    driver.addtask('Spider1', urls)
    driver.addtask('Spider2', urls[0])
    driver.start()
    driver.pause()
    time.sleep(1)
    driver.resume()
    time.sleep(2)
    driver.stop()
    wipe_frontiers()