Example #1
    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        # YAML keys with undefined values will be parsed as `None`.
        # E.g. with the YAML definition `name: `, `config.get("name", "default_value")`
        # will evaluate to `None` instead of `default_value`.
        # So, to avoid setting `self.name` to `None`, we use `or` to apply the
        # default instead of passing it to `config.get()`.
        self.name = self.config.get("name") or self.name
        self.validate_name()
        self.description = self.config.get("description") or self.name
        self.category = self.config.get("category") or "scrape"
        self.init_stage = self.config.get("init") or "init"
        self.delay = int(self.config.get("delay") or 0)
        self.expire = int(self.config.get("expire") or settings.EXPIRE) * 84600
        self.stealthy = self.config.get("stealthy") or False
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator") or {}

        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
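A quick illustration of the PyYAML behaviour the comment above works around: an empty YAML value parses as None, so dict.get() never falls back to its default (the key exists, its value is None), whereas `or` does apply the fallback. This is standard yaml/dict behaviour shown for context, not code from the example itself.

import yaml

config = yaml.safe_load("name: ")          # parses to {'name': None}
print(config.get("name", "fallback"))      # prints None: the key exists, its value is None
print(config.get("name") or "fallback")    # prints 'fallback'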
Example #2
 def test_active_dataset_status(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage("ingest")
     stage.queue({"test": "foo"}, {})
     stage.queue({"test": "bar"}, {})
     status = Dataset.get_active_dataset_status(self.conn)
     assert len(status["datasets"]) == 1
     assert status["total"] == 1
     assert status["datasets"]["test_1"]["pending"] == 2
     job.dataset.cancel()
     status = Dataset.get_active_dataset_status(self.conn)
     assert status["datasets"] == {}
     assert status["total"] == 0
Example #3
 def test_active_dataset_status(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage(Stage.INGEST)
     stage.queue({'test': 'foo'}, {})
     stage.queue({'test': 'bar'}, {})
     status = Dataset.get_active_dataset_status(self.conn)
     assert len(status['datasets']) == 1
     assert status['total'] == 1
     assert status['datasets']['test_1']['pending'] == 2
     job.dataset.cancel()
     status = Dataset.get_active_dataset_status(self.conn)
     assert status['datasets'] == {}
     assert status['total'] == 0
Example #4
    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        self.name = self.config.get("name", self.name)
        self.validate_name()
        self.description = self.config.get("description", self.name)
        self.category = self.config.get("category", "scrape")
        self.init_stage = self.config.get("init", "init")
        self.delay = int(self.config.get("delay", 0))
        self.expire = int(self.config.get("expire", settings.EXPIRE)) * 84600
        self.stealthy = self.config.get("stealthy", False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator", {})

        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
Example #5
    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        self.name = self.config.get('name', self.name)
        self.description = self.config.get('description', self.name)
        self.category = self.config.get('category', 'scrape')
        self.schedule = self.config.get('schedule', 'disabled')
        self.init_stage = self.config.get('init', 'init')
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get('delay', 0))
        self.expire = int(self.config.get('expire', settings.EXPIRE)) * 84600
        self.stealthy = self.config.get('stealthy', False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get('aggregator', {})

        self.stages = {}
        for name, stage in self.config.get('pipeline', {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
Example #6
class Crawler(object):
    """A processing graph that constitutes a crawler."""

    SCHEDULES = {
        "disabled": None,
        "hourly": timedelta(hours=1),
        "daily": timedelta(days=1),
        "weekly": timedelta(weeks=1),
        "monthly": timedelta(weeks=4),
    }

    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        # YAML keys with undefined values will be parsed as `None`.
        # E.g. with the YAML definition `name: `, `config.get("name", "default_value")`
        # will evaluate to `None` instead of `default_value`.
        # So, to avoid setting `self.name` to `None`, we use `or` to apply the
        # default instead of passing it to `config.get()`.
        self.name = self.config.get("name") or self.name
        self.validate_name()
        self.description = self.config.get("description") or self.name
        self.category = self.config.get("category") or "scrape"
        self.init_stage = self.config.get("init") or "init"
        self.delay = int(self.config.get("delay") or 0)
        self.expire = int(self.config.get("expire") or settings.EXPIRE) * 84600
        self.stealthy = self.config.get("stealthy") or False
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator") or {}

        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)

    def validate_name(self):
        if not re.match(r"^[A-Za-z0-9_-]+$", self.name):
            raise ValueError("Invalid crawler name: %s. "
                             "Allowed characters: A-Za-z0-9_-" % self.name)

    @property
    def aggregator_method(self):
        if self.aggregator_config:
            method = self.aggregator_config.get("method")
            if not method:
                return
            # method A: via a named Python entry point
            func = get_entry_point("memorious.operations", method)
            if func is not None:
                return func
            # method B: direct import from a module
            if ":" in method:
                package, method = method.rsplit(":", 1)
                module = import_module(package)
                return getattr(module, method)
            raise ValueError("Unknown method: %s", self.method_name)

    def aggregate(self, context):
        if self.aggregator_method:
            log.info("Running aggregator for %s" % self.name)
            params = self.aggregator_config.get("params", {})
            self.aggregator_method(context, params)

    def flush(self):
        """Delete all run-time data generated by this crawler."""
        self.queue.cancel()
        Crawl.flush(self)
        self.flush_tags()

    def flush_tags(self):
        tags.delete(prefix=make_key(self, "tag"))

    def cancel(self):
        Crawl.abort_all(self)
        self.queue.cancel()

    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
            "continue_on_error": settings.CONTINUE_ON_ERROR,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})

    @property
    def is_running(self):
        """Is the crawler currently running?"""
        for job in self.queue.get_jobs():
            if not job.is_done():
                return True
        return False

    @property
    def last_run(self):
        return Crawl.last_run(self)

    @property
    def op_count(self):
        """Total operations performed for this crawler"""
        return Crawl.op_count(self)

    @property
    def runs(self):
        return Crawl.runs(self)

    @property
    def latest_runid(self):
        return Crawl.latest_runid(self)

    @property
    def pending(self):
        status = self.queue.get_status()
        return status.get("pending")

    def get(self, name):
        return self.stages.get(name)

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.stages.values())

    def __repr__(self):
        return "<Crawler(%s)>" % self.name
Example #7
class Crawler(object):
    """A processing graph that constitutes a crawler."""
    SCHEDULES = {
        'disabled': None,
        'hourly': timedelta(hours=1),
        'daily': timedelta(days=1),
        'weekly': timedelta(weeks=1),
        'monthly': timedelta(weeks=4)
    }

    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        self.name = self.config.get('name', self.name)
        self.description = self.config.get('description', self.name)
        self.category = self.config.get('category', 'scrape')
        self.schedule = self.config.get('schedule', 'disabled')
        self.init_stage = self.config.get('init', 'init')
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get('delay', 0))
        self.expire = int(self.config.get('expire', settings.EXPIRE)) * 84600
        self.stealthy = self.config.get('stealthy', False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get('aggregator', {})

        self.stages = {}
        for name, stage in self.config.get('pipeline', {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)

    def check_due(self):
        """Check if the last execution of this crawler is older than
        the scheduled interval."""
        if self.is_running:
            return False
        if self.delta is None:
            return False
        last_run = self.last_run
        if last_run is None:
            return True
        now = datetime.utcnow()
        if now > last_run + self.delta:
            return True
        return False

    @property
    def aggregator_method(self):
        if self.aggregator_config:
            method = self.aggregator_config.get("method")
            if not method:
                return
            if ':' in method:
                package, method = method.rsplit(':', 1)
                module = import_module(package)
                return getattr(module, method)

    def aggregate(self, context):
        if self.aggregator_method:
            log.info("Running aggregator for %s" % self.name)
            params = self.aggregator_config.get("params", {})
            self.aggregator_method(context, params)

    def flush(self):
        """Delete all run-time data generated by this crawler."""
        self.queue.cancel()
        Event.delete(self)
        Crawl.flush(self)

    def flush_events(self):
        Event.delete(self)

    def cancel(self):
        Crawl.abort_all(self)
        self.queue.cancel()

    @property
    def should_timeout(self):
        if self.last_run is None:
            return False
        now = datetime.utcnow()
        return self.last_run < now - timedelta(
            seconds=settings.CRAWLER_TIMEOUT)  # noqa

    def timeout(self):
        log.warning("Crawler timed out: %s. Aggregator won't be run",
                    self.name)  # noqa
        self.cancel()

    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            'crawler': self.name,
            'run_id': run_id or Job.random_id(),
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        Queue.queue(self.init_stage, state, {})

    @property
    def is_running(self):
        """Is the crawler currently running?"""
        for job in self.queue.get_jobs():
            if not job.is_done():
                return True
        return False

    @property
    def last_run(self):
        return Crawl.last_run(self)

    @property
    def op_count(self):
        """Total operations performed for this crawler"""
        return Crawl.op_count(self)

    @property
    def runs(self):
        return Crawl.runs(self)

    @property
    def latest_runid(self):
        return Crawl.latest_runid(self)

    @property
    def pending(self):
        status = self.queue.get_status()
        return status.get('pending')

    def flush_tags(self):
        pipe = conn.pipeline()
        count = 0
        for key in conn.scan_iter(make_key(self, 'tag', '*')):
            pipe.delete(key)
            count += 1
        pipe.execute()
        log.info("Deleted %d tags", count)

    def get(self, name):
        return self.stages.get(name)

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.stages.values())

    def __repr__(self):
        return '<Crawler(%s)>' % self.name
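Example #7 layers scheduling onto the same constructor: SCHEDULES maps the configured schedule name to a timedelta, and check_due() treats the crawler as due once the current time has passed last_run + delta (and immediately if it has never run). A self-contained sketch of that comparison, with made-up timestamps; unlike check_due() it omits the extra "skip if currently running" guard:

from datetime import datetime, timedelta

SCHEDULES = {
    "disabled": None,
    "hourly": timedelta(hours=1),
    "daily": timedelta(days=1),
    "weekly": timedelta(weeks=1),
    "monthly": timedelta(weeks=4),
}

def is_due(schedule, last_run, now=None):
    # Same decision order as check_due(): disabled/unknown schedules are never
    # due, a crawler that never ran is due immediately, otherwise compare times.
    delta = SCHEDULES.get(schedule)
    if delta is None:
        return False
    if last_run is None:
        return True
    now = now or datetime.utcnow()
    return now > last_run + delta

print(is_due("daily", datetime.utcnow() - timedelta(hours=26)))  # True: overdue
print(is_due("weekly", datetime.utcnow() - timedelta(days=2)))   # False: not yet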
Example #8
def cancel_queue(collection):
    Dataset(kv, collection.foreign_id).cancel()
Example #9
def get_active_collection_status():
    data = Dataset.get_active_dataset_status(kv)
    return data
Example #10
def get_status(collection):
    return Dataset(kv, collection.foreign_id).get_status()
Example #11
def cancel(dataset):
    """Delete scheduled tasks for given dataset"""
    conn = get_redis()
    Dataset(conn, dataset).cancel()
Example #12
 def cleanup_jobs(self):
     for dataset in Dataset.get_active_datasets(kv):
         for job in dataset.get_jobs():
             self.cleanup_job(job)
Example #13
def cancel_queue(collection):
    dataset = dataset_from_collection(collection)
    Dataset(kv, dataset).cancel()
Example #14
def get_status(collection):
    dataset = dataset_from_collection(collection)
    return Dataset(kv, dataset).get_status()
Example #15
class Crawler(object):
    """A processing graph that constitutes a crawler."""

    SCHEDULES = {
        "disabled": None,
        "hourly": timedelta(hours=1),
        "daily": timedelta(days=1),
        "weekly": timedelta(weeks=1),
        "monthly": timedelta(weeks=4),
    }

    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        self.name = self.config.get("name", self.name)
        self.validate_name()
        self.description = self.config.get("description", self.name)
        self.category = self.config.get("category", "scrape")
        self._schedule = self.config.get("schedule", "disabled")
        self.init_stage = self.config.get("init", "init")
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get("delay", 0))
        self.expire = int(self.config.get("expire", settings.EXPIRE)) * 84600
        self.stealthy = self.config.get("stealthy", False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator", {})

        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)

    def check_due(self):
        """Check if the last execution of this crawler is older than
        the scheduled interval."""
        if self.is_running:
            return False
        if self.delta is None:
            return False
        last_run = self.last_run
        if last_run is None:
            return True
        now = datetime.utcnow()
        if now > last_run + self.delta:
            return True
        return False

    def validate_name(self):
        if not re.match(r"^[A-Za-z0-9_-]+$", self.name):
            raise ValueError("Invalid crawler name: %s. "
                             "Allowed characters: A-Za-z0-9_-" % self.name)

    @property
    def schedule(self):
        schedule = Crawl.get_schedule(self) or self._schedule
        return schedule if schedule in self.SCHEDULES else "disabled"

    @property
    def aggregator_method(self):
        if self.aggregator_config:
            method = self.aggregator_config.get("method")
            if not method:
                return
            # method A: via a named Python entry point
            func = get_entry_point("memorious.operations", method)
            if func is not None:
                return func
            # method B: direct import from a module
            if ":" in method:
                package, method = method.rsplit(":", 1)
                module = import_module(package)
                return getattr(module, method)
            raise ValueError("Unknown method: %s", self.method_name)

    def aggregate(self, context):
        if self.aggregator_method:
            log.info("Running aggregator for %s" % self.name)
            params = self.aggregator_config.get("params", {})
            self.aggregator_method(context, params)

    def flush(self):
        """Delete all run-time data generated by this crawler."""
        self.queue.cancel()
        Event.delete(self)
        Crawl.flush(self)
        self.flush_tags()

    def flush_tags(self):
        tags.delete(prefix=make_key(self, "tag"))

    def flush_events(self):
        Event.delete(self)

    def cancel(self):
        Crawl.abort_all(self)
        self.queue.cancel()

    @property
    def should_timeout(self):
        if self.last_run is None:
            return False
        now = datetime.utcnow()
        return self.last_run < now - timedelta(
            seconds=settings.CRAWLER_TIMEOUT)  # noqa

    def timeout(self):
        log.warning("Crawler timed out: %s. Aggregator won't be run",
                    self.name)  # noqa
        self.cancel()

    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})

    @property
    def is_running(self):
        """Is the crawler currently running?"""
        for job in self.queue.get_jobs():
            if not job.is_done():
                return True
        return False

    @property
    def last_run(self):
        return Crawl.last_run(self)

    @property
    def op_count(self):
        """Total operations performed for this crawler"""
        return Crawl.op_count(self)

    @property
    def runs(self):
        return Crawl.runs(self)

    @property
    def latest_runid(self):
        return Crawl.latest_runid(self)

    @property
    def pending(self):
        status = self.queue.get_status()
        return status.get("pending")

    def get(self, name):
        return self.stages.get(name)

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.stages.values())

    def __repr__(self):
        return "<Crawler(%s)>" % self.name