Beispiel #1
0
    def __init__(self, manager, source_file):
        """Load a crawler definition from a YAML file.

        Reads ``source_file`` as UTF-8, parses it with ``yaml.safe_load``,
        copies the individual settings onto the instance and creates one
        ``CrawlerStage`` per entry of the ``pipeline`` mapping.

        Args:
            manager: Stored unchanged as ``self.manager``.
            source_file: Path of the YAML file to read.
        """
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
        # An empty YAML document parses to `None`; fall back to an empty
        # mapping so the lookups below do not fail with AttributeError.
        self.config = yaml.safe_load(self.config_yaml) or {}

        self.name = os.path.basename(source_file)
        # YAML keys with undefined values will be parsed as `None`.
        # eg: with the yaml definition `name: `, `config.get("name", "default_value")`
        # will evaluate to `None` instead of `default_value`.
        # So in order to avoid setting `self.name` to `None`, we use `or` to
        # set the default instead of passing it to `config.get()`
        self.name = self.config.get("name") or self.name
        self.validate_name()
        self.description = self.config.get("description") or self.name
        self.category = self.config.get("category") or "scrape"
        self.init_stage = self.config.get("init") or "init"
        self.delay = int(self.config.get("delay") or 0)
        # NOTE(review): 84600 looks like a typo for 86400 (seconds per day)
        # if `expire` is meant to be a number of days -- confirm before
        # changing, the value is kept as-is here.
        self.expire = int(self.config.get("expire") or settings.EXPIRE) * 84600
        self.stealthy = self.config.get("stealthy") or False
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator") or {}

        self.stages = {}
        # Same `None` pitfall as above: `pipeline: ` without a value must be
        # treated as an empty pipeline instead of crashing on `.items()`.
        pipeline = self.config.get("pipeline") or {}
        for name, stage in pipeline.items():
            self.stages[name] = CrawlerStage(self, name, stage)
Beispiel #2
0
    def __init__(self, manager, source_file):
        """Load a crawler definition from a YAML file.

        Reads ``source_file`` as UTF-8, parses it with ``yaml.safe_load``,
        copies the individual settings onto the instance and creates one
        ``CrawlerStage`` per entry of the ``pipeline`` mapping.

        Args:
            manager: Stored unchanged as ``self.manager``.
            source_file: Path of the YAML file to read.
        """
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
        # An empty YAML document parses to `None`; fall back to an empty
        # mapping so the lookups below do not fail with AttributeError.
        self.config = yaml.safe_load(self.config_yaml) or {}

        def setting(key, default):
            # A key written without a value (`name: `) is parsed as `None`,
            # and `dict.get(key, default)` returns that `None` rather than
            # the default. Only `None` is replaced here, so explicit falsy
            # values such as `0` or `false` are kept.
            value = self.config.get(key)
            return default if value is None else value

        self.name = setting("name", os.path.basename(source_file))
        self.validate_name()
        self.description = setting("description", self.name)
        self.category = setting("category", "scrape")
        self.init_stage = setting("init", "init")
        self.delay = int(setting("delay", 0))
        # NOTE(review): 84600 looks like a typo for 86400 (seconds per day)
        # if `expire` is meant to be a number of days -- confirm before
        # changing, the value is kept as-is here.
        self.expire = int(setting("expire", settings.EXPIRE)) * 84600
        self.stealthy = setting("stealthy", False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = setting("aggregator", {})

        self.stages = {}
        for name, stage in setting("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
Beispiel #3
0
    def __init__(self, manager, source_file):
        """Load a crawler definition from a YAML file.

        Reads ``source_file`` as UTF-8, parses it with ``yaml.safe_load``,
        copies the individual settings onto the instance, looks up the
        configured schedule in ``Crawler.SCHEDULES`` and creates one
        ``CrawlerStage`` per entry of the ``pipeline`` mapping.

        Args:
            manager: Stored unchanged as ``self.manager``.
            source_file: Path of the YAML file to read.
        """
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()
        # An empty YAML document parses to `None`; fall back to an empty
        # mapping so the lookups below do not fail with AttributeError.
        self.config = yaml.safe_load(self.config_yaml) or {}

        def setting(key, default):
            # A key written without a value (`name: `) is parsed as `None`,
            # and `dict.get(key, default)` returns that `None` rather than
            # the default. Only `None` is replaced here, so explicit falsy
            # values such as `0` or `false` are kept.
            value = self.config.get(key)
            return default if value is None else value

        self.name = setting('name', os.path.basename(source_file))
        self.description = setting('description', self.name)
        self.category = setting('category', 'scrape')
        self.schedule = setting('schedule', 'disabled')
        self.init_stage = setting('init', 'init')
        # `delta` is `None` when the schedule name is not in SCHEDULES.
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(setting('delay', 0))
        # NOTE(review): 84600 looks like a typo for 86400 (seconds per day)
        # if `expire` is meant to be a number of days -- confirm before
        # changing, the value is kept as-is here.
        self.expire = int(setting('expire', settings.EXPIRE)) * 84600
        self.stealthy = setting('stealthy', False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = setting('aggregator', {})

        self.stages = {}
        for name, stage in setting('pipeline', {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
Beispiel #4
0
def cancel_queue(collection):
    """Call ``cancel()`` on the ``Dataset`` named by ``collection.foreign_id``."""
    queue = Dataset(kv, collection.foreign_id)
    queue.cancel()
Beispiel #5
0
def get_status(collection):
    """Return ``get_status()`` of the ``Dataset`` named by ``collection.foreign_id``."""
    queue = Dataset(kv, collection.foreign_id)
    return queue.get_status()
Beispiel #6
0
def cancel(dataset):
    """Delete scheduled tasks for the given dataset.

    Obtains a connection from ``get_redis()`` and calls ``cancel()`` on the
    ``Dataset`` built from that connection and ``dataset``.
    """
    queue = Dataset(get_redis(), dataset)
    queue.cancel()
Beispiel #7
0
def cancel_queue(collection):
    """Call ``cancel()`` on the ``Dataset`` derived from ``collection``.

    The dataset identifier is obtained via ``dataset_from_collection``.
    """
    dataset_name = dataset_from_collection(collection)
    queue = Dataset(kv, dataset_name)
    queue.cancel()
Beispiel #8
0
def get_status(collection):
    """Return ``get_status()`` of the ``Dataset`` derived from ``collection``.

    The dataset identifier is obtained via ``dataset_from_collection``.
    """
    dataset_name = dataset_from_collection(collection)
    queue = Dataset(kv, dataset_name)
    return queue.get_status()