import io
import os

import yaml

# `settings`, `conn`, `Dataset` and `CrawlerStage` are assumed to be provided
# by the surrounding package.


def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with io.open(source_file, encoding="utf-8") as fh:
        self.config_yaml = fh.read()
    self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    # YAML keys with undefined values will be parsed as `None`.
    # eg: with the yaml definition `name: `, `config.get("name", "default_value")`
    # will evaluate to `None` instead of `default_value`.
    # So in order to avoid setting `self.name` to `None`, we use `or` to
    # set the default instead of passing it to `config.get()`.
    self.name = self.config.get("name") or self.name
    self.validate_name()
    self.description = self.config.get("description") or self.name
    self.category = self.config.get("category") or "scrape"
    self.init_stage = self.config.get("init") or "init"
    self.delay = int(self.config.get("delay") or 0)
    # days -> seconds; there are 86400 seconds in a day (the original
    # multiplier 84600 looks like a digit transposition).
    self.expire = int(self.config.get("expire") or settings.EXPIRE) * 86400
    self.stealthy = self.config.get("stealthy") or False
    self.queue = Dataset(conn, self.name)
    self.aggregator_config = self.config.get("aggregator") or {}
    self.stages = {}
    for name, stage in self.config.get("pipeline", {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
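The comment above is worth seeing in action. A minimal, self-contained sketch of the pitfall using `yaml.safe_load` (nothing here depends on the crawler code):

import yaml

config = yaml.safe_load("name:\n")            # key present, value undefined
print(config)                                 # {'name': None}
print(config.get("name", "default_value"))   # None -- the key exists, so the
                                              # default is never consulted
print(config.get("name") or "default_value")  # 'default_value'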
def handle(self, task):
    apply_task_context(task)
    data = task.payload
    # Strip the crawler namespace from the queued stage id to recover the
    # plain stage name.
    stage = CrawlerStage.detach_namespace(task.stage.stage)
    state = task.context
    # Rebuild the execution context from the serialized task state, then
    # run the stage's method against the payload.
    context = Context.from_state(state, stage)
    context.execute(data)
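For readers unfamiliar with the queueing layer: stage ids on the queue carry the crawler name as a namespace prefix, which is why `handle` detaches it before rebuilding the context. A rough sketch of the operation, assuming a `<crawler>:<stage>` layout (the actual separator and format belong to `CrawlerStage` and are not confirmed here):

def detach_namespace(stage):
    # "my_crawler:fetch" -> "fetch"; ids without a prefix pass through.
    return stage.split(":", 1)[-1]

assert detach_namespace("my_crawler:fetch") == "fetch"
assert detach_namespace("fetch") == "fetch"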
def after_task(self, task):
    # Once the whole job has drained, rebuild a context and run the
    # crawler's aggregator exactly once.
    if task.job.is_done():
        stage = CrawlerStage.detach_namespace(task.stage.stage)
        state = task.context
        context = Context.from_state(state, stage)
        context.crawler.aggregate(context)
    self.timeout_expiration_check()
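`context.crawler.aggregate(context)` is the hook that consumes the `aggregator` section parsed in `__init__`. A hedged sketch of how such a section could be dispatched; the `method` and `params` key names and the `package.module:function` convention are assumptions for illustration, not a confirmed API:

import importlib

def aggregate(self, context):
    # Nothing configured -> nothing to do.
    method = self.aggregator_config.get("method")
    if method is None:
        return
    # Resolve a "package.module:function" path and call it once per job.
    module_name, func_name = method.rsplit(":", 1)
    func = getattr(importlib.import_module(module_name), func_name)
    func(context, self.aggregator_config.get("params") or {})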
def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with open(source_file, encoding='utf-8') as fh:
        self.config_yaml = fh.read()
    # `yaml.load` without an explicit Loader is unsafe and deprecated in
    # PyYAML >= 5.1; `safe_load` parses plain config data just as well.
    self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    self.name = self.config.get('name', self.name)
    self.description = self.config.get('description', self.name)
    self.schedule = self.config.get('schedule')
    self.disabled = self.config.get('disabled', False)
    self.init_stage = self.config.get('init', 'init')
    self.delta = Crawler.SCHEDULES.get(self.schedule)
    self.delay = int(self.config.get('delay', 0))
    self.expire = int(self.config.get('expire', settings.EXPIRE))
    self.stealthy = self.config.get('stealthy', False)
    self.stages = {}
    for name, stage in self.config.get('pipeline', {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
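`Crawler.SCHEDULES` is referenced above but not shown. A plausible shape for it, mapping schedule keywords to `timedelta` intervals; the exact keys and values are assumptions:

from datetime import timedelta

SCHEDULES = {
    'disabled': None,
    'hourly': timedelta(hours=1),
    'daily': timedelta(days=1),
    'weekly': timedelta(weeks=1),
    'monthly': timedelta(weeks=4),
}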
def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with io.open(source_file, encoding="utf-8") as fh:
        self.config_yaml = fh.read()
    self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    self.name = self.config.get("name", self.name)
    self.validate_name()
    self.description = self.config.get("description", self.name)
    self.category = self.config.get("category", "scrape")
    self.init_stage = self.config.get("init", "init")
    self.delay = int(self.config.get("delay", 0))
    # days -> seconds (86400 seconds per day).
    self.expire = int(self.config.get("expire", settings.EXPIRE)) * 86400
    self.stealthy = self.config.get("stealthy", False)
    self.queue = Dataset(conn, self.name)
    self.aggregator_config = self.config.get("aggregator", {})
    self.stages = {}
    for name, stage in self.config.get("pipeline", {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
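With `expire` stored in seconds, checking staleness is a single subtraction. A minimal sketch, assuming `last_run` is a UTC `datetime` of the previous crawl; `is_expired` is a hypothetical helper, not part of the class above:

from datetime import datetime, timezone

def is_expired(last_run, expire_seconds):
    # Never crawled, or the last run is older than the expiry window.
    if last_run is None:
        return True
    age = (datetime.now(timezone.utc) - last_run).total_seconds()
    return age >= expire_seconds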
def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with io.open(source_file, encoding='utf-8') as fh:
        self.config_yaml = fh.read()
    self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    self.name = self.config.get('name', self.name)
    self.description = self.config.get('description', self.name)
    self.category = self.config.get('category', 'scrape')
    self.schedule = self.config.get('schedule', 'disabled')
    self.init_stage = self.config.get('init', 'init')
    self.delta = Crawler.SCHEDULES.get(self.schedule)
    self.delay = int(self.config.get('delay', 0))
    # days -> seconds (86400 seconds per day).
    self.expire = int(self.config.get('expire', settings.EXPIRE)) * 86400
    self.stealthy = self.config.get('stealthy', False)
    self.queue = Dataset(conn, self.name)
    self.aggregator_config = self.config.get('aggregator', {})
    self.stages = {}
    for name, stage in self.config.get('pipeline', {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
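Finally, the `pipeline` loop expects a mapping of stage names to stage definitions. A minimal, hypothetical crawler YAML that this `__init__` would accept; the stage names, methods and nested keys below are invented for illustration:

import yaml

example = yaml.safe_load("""
name: example_crawler
description: A two-stage example crawl
schedule: weekly
expire: 7
pipeline:
  init:
    method: seed
    params:
      urls:
        - https://example.org/
    handle:
      pass: fetch
  fetch:
    method: fetch
""")
print(sorted(example["pipeline"].keys()))  # ['fetch', 'init']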