class ScraperEngine(object):
    """Sequentially run the scraper provided by each loaded plugin.

    Plugins are discovered through a ``PluginLoader``; nothing is loaded
    until :meth:`load_plugins` is called explicitly.
    """

    def __init__(self, plugin_dir, filename_regexp=None):
        """Store the plugin directory and create the plugin loader.

        :param plugin_dir: directory later passed to the loader by
            :meth:`load_plugins`.
        :param filename_regexp: forwarded unchanged to ``PluginLoader``
            (presumably a plugin file-name filter -- confirm against
            ``PluginLoader``).
        """
        self.plugin_dir = plugin_dir
        self.plugin_loader = PluginLoader(filename_regexp)

    def load_plugins(self):
        """Ask the loader to load plugins from ``self.plugin_dir``."""
        search_path = [self.plugin_dir]
        self.plugin_loader.load_plugins(search_path)

    def run(self):
        """Obtain a scraper from every loaded plugin and run it, in order."""
        for plugin in self.plugin_loader.plugins:
            plugin.get_scraper().scrape()
class BidScraperEngine(TaskManager):
    """Task manager that wraps every loaded plugin in a ``PluginRunner`` task.

    Progress is written through the shared ``'bidmap'`` logger to a log file.
    """

    def __init__(self, plugin_dir, results_dir,
                 logfile='/var/log/bidmap/bid_scraper_engine.log'):
        """Configure logging and initialise the underlying task manager.

        :param plugin_dir: directory handed to the plugin loader by
            :meth:`load_plugins`.
        :param results_dir: passed to every ``PluginRunner`` as ``results_dir``.
        :param logfile: path of the log file; also passed to every
            ``PluginRunner``.
        """
        self.plugin_dir = plugin_dir
        self.results_dir = results_dir
        self.logfile = logfile
        self.plugins = []
        self.pldr = PluginLoader()

        self.logger = logging.getLogger('bidmap')
        self.logger.setLevel(logging.DEBUG)
        self._attach_file_handler(logfile)

        super(BidScraperEngine, self).__init__()

    def _attach_file_handler(self, logfile):
        """Attach a DEBUG file handler for *logfile* unless one already exists.

        ``logging.getLogger('bidmap')`` returns the same logger object on
        every call, so adding a handler unconditionally would make each
        additional engine instance truncate the file again (``mode='w'``) and
        duplicate every log record.
        """
        import os  # local import: only needed to normalise the path

        target = os.path.abspath(logfile)
        for handler in self.logger.handlers:
            if (isinstance(handler, logging.FileHandler)
                    and handler.baseFilename == target):
                return

        fh = logging.FileHandler(filename=logfile, mode='w')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(fh)

    def load_plugins(self):
        """Load plugins, shuffle them, and register one task per plugin."""
        self.pldr.load_plugins([self.plugin_dir])
        # Shuffle in place so tasks are registered in a random order.
        random.shuffle(self.pldr.plugins)

        tasks = [
            PluginRunner(plugin=p,
                         results_dir=self.results_dir,
                         logfile=self.logfile)
            for p in self.pldr.plugins
        ]
        self.set_task_list(tasks=tasks)
        self.logger.info('loaded %d plugins', len(self.pldr.plugins))

    def task_launched(self, task, pid):
        """Log that *task* was launched with process id *pid*.

        NOTE(review): presumably a hook invoked by ``TaskManager`` -- confirm.
        """
        self.logger.info('Task %d for plugin %s launched',
                         int(pid), task.plugin)

    def task_completed(self, task, pid):
        """Log that *task* with process id *pid* completed.

        NOTE(review): presumably a hook invoked by ``TaskManager`` -- confirm.
        """
        self.logger.info('Task %d for plugin %s completed',
                         int(pid), task.plugin)
class ScraperEngine(object):
    """Run plugin processes concurrently, at most ``max_active`` at a time."""

    def __init__(self, plugin_dir, results_dir='./results/',
                 filename_regexp=None, max_active=5, max_run_time=None):
        """Load the plugins and queue one ``PluginProcess`` per plugin.

        :param plugin_dir: directory the plugin loader loads from.
        :param results_dir: passed to every ``PluginProcess`` as
            ``result_dir``.
        :param filename_regexp: forwarded unchanged to ``PluginLoader``.
        :param max_active: upper bound on simultaneously active child
            processes.
        :param max_run_time: if truthy, processes running longer than this
            are terminated; it is compared against a ``datetime``
            difference, so it must be a ``timedelta``.
        """
        self.num_active = 0
        self.max_active = max_active
        self.max_run_time = max_run_time
        self.plugin_loader = PluginLoader(filename_regexp)
        self.plugin_loader.load_plugins([plugin_dir])
        self.process_list = [
            PluginProcess(plugin=p, result_dir=results_dir)
            for p in self.plugin_loader.plugins
        ]

    def launch_max_active(self):
        """Launch queued processes until ``max_active`` children are active."""
        self.num_active = len(active_children())
        while self.process_list and self.num_active < self.max_active:
            process = self.process_list.pop(0)
            # Stamp the launch time so run() can detect hung processes.
            process.start_time = datetime.now()
            process.start()
            self.num_active = len(active_children())

    def _terminate_overdue(self):
        """Terminate live children that have exceeded ``max_run_time``."""
        now = datetime.now()
        for proc in active_children():
            # active_children() reports every live child of this interpreter,
            # including ones this engine did not launch; those carry no
            # start_time stamp and must be left alone instead of raising
            # AttributeError.
            started = getattr(proc, 'start_time', None)
            if started is None:
                continue
            if proc.is_alive() and now - started > self.max_run_time:
                proc.terminate()

    def run(self):
        """Run until the queue is empty and no child process is active.

        Polls once per second, topping up the set of active processes and,
        when ``max_run_time`` is set, terminating processes that ran too long.
        """
        while self.process_list or self.num_active > 0:
            self.launch_max_active()

            # Monitor for hung processes.
            if self.max_run_time:
                self._terminate_overdue()

            time.sleep(1)
            self.num_active = len(active_children())
def __init__(self, config):
    """Keep the configuration and load plugins and handlers from it.

    :param config: stored on the instance and passed to ``PluginLoader``.

    ``PluginLoader.load_plugins()`` is expected to return a
    ``(plugins, handlers)`` pair, which is stored on the instance.
    """
    self.config = config
    self.plugins, self.handlers = PluginLoader(config).load_plugins()