Ejemplo n.º 1
0
    def __init__(self, manager, source_file):
        """Load and parse a crawler definition from a YAML config file.

        :param manager: the manager that owns this crawler.
        :param source_file: path to the crawler's YAML configuration file.
        """
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        # YAML keys with undefined values will be parsed as `None`.
        # eg: with the yaml definition `name: `, `config.get("name", "default_value")`
        # will evaluate to `None` instead of `default_value`.
        # So in order to avoid setting `self.name` to `None`, we use `or` to
        # set the default instead of passing it to `config.get()`
        self.name = self.config.get("name") or self.name
        self.validate_name()
        self.description = self.config.get("description") or self.name
        self.category = self.config.get("category") or "scrape"
        self.init_stage = self.config.get("init") or "init"
        self.delay = int(self.config.get("delay") or 0)
        # `expire` is a number of days; convert it to seconds.
        # Fixed multiplier: 86400 == 24 * 60 * 60 (was the typo 84600).
        self.expire = int(self.config.get("expire") or settings.EXPIRE) * 86400
        self.stealthy = self.config.get("stealthy") or False
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator") or {}

        # Build one CrawlerStage per entry in the `pipeline` mapping.
        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
Ejemplo n.º 2
0
 def handle(self, task):
     """Run a single queued task through its pipeline stage.

     Re-establishes the task's logging/tracing context, rebuilds the
     execution Context from the task's serialized state, and executes
     the stage with the task payload.
     """
     apply_task_context(task)
     stage_name = CrawlerStage.detach_namespace(task.stage.stage)
     run_context = Context.from_state(task.context, stage_name)
     run_context.execute(task.payload)
Ejemplo n.º 3
0
 def after_task(self, task):
     """Post-task hook: aggregate once the job finishes, then check timeouts.

     When the task's job has completed, rebuild the execution Context from
     the task's serialized state and trigger the crawler's aggregation
     step; in all cases run the timeout/expiration check afterwards.
     """
     if task.job.is_done():
         stage_name = CrawlerStage.detach_namespace(task.stage.stage)
         done_context = Context.from_state(task.context, stage_name)
         done_context.crawler.aggregate(done_context)
     self.timeout_expiration_check()
Ejemplo n.º 4
0
    def __init__(self, manager, source_file):
        """Load and parse a crawler definition from a YAML config file.

        :param manager: the manager that owns this crawler.
        :param source_file: path to the crawler's YAML configuration file.
        """
        self.manager = manager
        self.source_file = source_file
        # Open with an explicit encoding and parse with `safe_load`:
        # `yaml.load` without a Loader argument is deprecated and unsafe,
        # since it can construct arbitrary Python objects from the input.
        with open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        self.name = self.config.get('name', self.name)
        self.description = self.config.get('description', self.name)
        self.schedule = self.config.get('schedule')
        self.disabled = self.config.get('disabled', False)
        self.init_stage = self.config.get('init', 'init')
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get('delay', 0))
        self.expire = int(self.config.get('expire', settings.EXPIRE))
        self.stealthy = self.config.get('stealthy', False)

        # Build one CrawlerStage per entry in the `pipeline` mapping.
        self.stages = {}
        for name, stage in self.config.get('pipeline', {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
Ejemplo n.º 5
0
    def __init__(self, manager, source_file):
        """Load and parse a crawler definition from a YAML config file.

        :param manager: the manager that owns this crawler.
        :param source_file: path to the crawler's YAML configuration file.
        """
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        # YAML keys with an empty value parse as `None`, and
        # `config.get(key, default)` returns that `None` instead of the
        # default. Use `or` so such keys fall back to the default value.
        self.name = self.config.get("name") or self.name
        self.validate_name()
        self.description = self.config.get("description") or self.name
        self.category = self.config.get("category") or "scrape"
        self.init_stage = self.config.get("init") or "init"
        self.delay = int(self.config.get("delay") or 0)
        # `expire` is a number of days; convert it to seconds.
        # Fixed multiplier: 86400 == 24 * 60 * 60 (was the typo 84600).
        self.expire = int(self.config.get("expire") or settings.EXPIRE) * 86400
        self.stealthy = self.config.get("stealthy") or False
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator") or {}

        # Build one CrawlerStage per entry in the `pipeline` mapping.
        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
Ejemplo n.º 6
0
    def __init__(self, manager, source_file):
        """Load and parse a crawler definition from a YAML config file.

        :param manager: the manager that owns this crawler.
        :param source_file: path to the crawler's YAML configuration file.
        """
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        self.name = self.config.get('name', self.name)
        self.description = self.config.get('description', self.name)
        self.category = self.config.get('category', 'scrape')
        self.schedule = self.config.get('schedule', 'disabled')
        self.init_stage = self.config.get('init', 'init')
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get('delay', 0))
        # `expire` is a number of days; convert it to seconds.
        # Fixed multiplier: 86400 == 24 * 60 * 60 (was the typo 84600).
        self.expire = int(self.config.get('expire', settings.EXPIRE)) * 86400
        self.stealthy = self.config.get('stealthy', False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get('aggregator', {})

        # Build one CrawlerStage per entry in the `pipeline` mapping.
        self.stages = {}
        for name, stage in self.config.get('pipeline', {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)