Ejemplo n.º 1
0
class ScraperEngine(object):
    """Run the scraper of every plugin found in a single plugin directory.

    Typical use is ``engine.load_plugins()`` followed by ``engine.run()``.
    """

    def __init__(self, plugin_dir, filename_regexp=None):
        """Remember the plugin directory and build the plugin loader.

        Args:
            plugin_dir: Directory handed to the loader by ``load_plugins``.
            filename_regexp: Forwarded unchanged to ``PluginLoader``
                (presumably a filter on plugin file names -- confirm
                against ``PluginLoader``).
        """
        loader = PluginLoader(filename_regexp)
        self.plugin_dir = plugin_dir
        self.plugin_loader = loader

    def load_plugins(self):
        """Ask the loader to load plugins from ``self.plugin_dir``."""
        search_dirs = [self.plugin_dir]
        self.plugin_loader.load_plugins(search_dirs)

    def run(self):
        """Scrape with each loaded plugin, one after another, in loader order."""
        for plugin in self.plugin_loader.plugins:
            plugin.get_scraper().scrape()
Ejemplo n.º 2
0
class BidScraperEngine(TaskManager):
    """Task manager that runs one ``PluginRunner`` task per loaded plugin.

    Progress is written to the ``'bidmap'`` logger, which this class
    configures with a DEBUG-level file handler.
    """

    def __init__(self,
                 plugin_dir,
                 results_dir,
                 logfile='/var/log/bidmap/bid_scraper_engine.log'):
        """Store the configuration and set up file logging.

        Args:
            plugin_dir: Directory handed to the plugin loader.
            results_dir: Passed to every ``PluginRunner`` as ``results_dir``.
            logfile: Path of the log file. It is opened with mode ``'w'``,
                so an existing file is truncated when the handler is created.
        """
        import os  # local import: only needed for the handler check below

        self.plugin_dir = plugin_dir
        self.results_dir = results_dir
        self.logfile = logfile
        self.plugins = []  # kept for callers; not populated by this class
        self.pldr = PluginLoader()

        self.logger = logging.getLogger('bidmap')
        self.logger.setLevel(logging.DEBUG)

        # logging.getLogger() returns the same logger object on every call,
        # so a second engine in the same process used to attach a second
        # handler to it: every record was then written twice and one more
        # file descriptor stayed open.  Attach only if this file is new.
        target = os.path.abspath(logfile)
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and getattr(handler, 'baseFilename', None) == target
            for handler in self.logger.handlers)

        if not already_attached:
            fh = logging.FileHandler(filename=logfile, mode='w')
            fh.setLevel(logging.DEBUG)

            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            fh.setFormatter(formatter)

            self.logger.addHandler(fh)

        super(BidScraperEngine, self).__init__()

    def load_plugins(self):
        """Load the plugins, shuffle them, and register one task per plugin."""
        self.pldr.load_plugins([self.plugin_dir])
        # Shuffles the loader's own list in place, so the task order differs
        # from run to run.
        random.shuffle(self.pldr.plugins)

        tasks = [
            PluginRunner(plugin=p,
                         results_dir=self.results_dir,
                         logfile=self.logfile) for p in self.pldr.plugins
        ]
        self.set_task_list(tasks=tasks)

        # Arguments are passed separately so formatting only happens when
        # the record is actually emitted.
        self.logger.info('loaded %d plugins', len(self.pldr.plugins))

    def task_launched(self, task, pid):
        """Log that ``task`` started as process ``pid``.

        NOTE(review): presumably a hook invoked by ``TaskManager`` -- confirm.
        """
        self.logger.info('Task %d for plugin %s launched',
                         int(pid), task.plugin)

    def task_completed(self, task, pid):
        """Log that ``task`` (process ``pid``) finished.

        NOTE(review): presumably a hook invoked by ``TaskManager`` -- confirm.
        """
        self.logger.info('Task %d for plugin %s completed',
                         int(pid), task.plugin)
Ejemplo n.º 3
0
class ScraperEngine(object):
    """Run every loaded plugin in its own process, a bounded number at a time.

    One ``PluginProcess`` is created per plugin at construction time; ``run``
    then starts them in list order while keeping at most ``max_active``
    child processes alive.
    """

    def __init__(self, plugin_dir, results_dir='./results/', filename_regexp=None, max_active=5, max_run_time=None):
        """Load the plugins and queue one process per plugin.

        Args:
            plugin_dir: Directory handed to the plugin loader.
            results_dir: Passed to every ``PluginProcess`` as ``result_dir``.
            filename_regexp: Forwarded unchanged to ``PluginLoader``.
            max_active: Upper bound on simultaneously active child processes.
            max_run_time: Optional run-time limit per process.  It is
                compared with a difference of two ``datetime`` values, so it
                must be a ``datetime.timedelta``.  A falsy value (the
                default ``None``) disables the hung-process check.
        """
        self.num_active = 0
        self.max_active = max_active
        self.max_run_time = max_run_time

        self.plugin_loader = PluginLoader(filename_regexp)
        self.plugin_loader.load_plugins([plugin_dir])

        self.process_list = [
            PluginProcess(plugin=p, result_dir=results_dir)
            for p in self.plugin_loader.plugins
        ]

    def launch_max_active(self):
        """Launch up to max_active processes to run plugins."""
        self.num_active = len(active_children())

        while self.process_list and self.num_active < self.max_active:
            process = self.process_list.pop(0)
            # Stamped before start() so the monitor in run() can measure
            # how long the process has been running.
            process.start_time = datetime.now()
            process.start()

            self.num_active = len(active_children())

    def run(self):
        """Start all queued processes and wait until no child is active.

        Polls once per second.  When ``max_run_time`` is set, any process
        launched by this engine that is still alive after that long is
        terminated.

        NOTE(review): ``num_active`` counts every active child of the
        current process, including ones this engine did not start.
        """
        while self.process_list or self.num_active > 0:
            self.launch_max_active()

            # Monitor for hung processes.
            if self.max_run_time:
                now = datetime.now()
                for child in active_children():
                    # active_children() lists all live children of this
                    # process; only the ones launched above carry a
                    # start_time.  Skip the rest instead of failing with
                    # AttributeError (or terminating a foreign process).
                    started = getattr(child, 'start_time', None)
                    if started is None:
                        continue
                    if child.is_alive() and now - started > self.max_run_time:
                        child.terminate()

            time.sleep(1)
            self.num_active = len(active_children())
Ejemplo n.º 4
0
 def __init__(self, config):
     """Keep the configuration and load plugins and handlers from it.

     A ``PluginLoader`` is built from ``config``; the two values returned
     by its ``load_plugins()`` call are stored as ``self.plugins`` and
     ``self.handlers``, in that order.
     """
     self.config = config
     # load_plugins() must return exactly two values: plugins, then handlers.
     self.plugins, self.handlers = PluginLoader(config).load_plugins()