Example 1
def rules_function(ctx, rule, activate):
    """
    Function for ``eurocli rules [...]``

    Args:
        ctx (click.Context): context object
        rule (iterable of int): id(s) of the rules to modify
        activate (bool): target state of the rules
    """
    r = Rules(ctx.obj["db"])

    if rule:
        for ru in rule:
            try:
                r.update_rule_state(ru, activate)
            except Exception as e:
                print(e)

    values, keys = r.get_rules()
    table = BeautifulTable()
    table.columns.header = keys
    for row in values:
        table.rows.append(row)
    click.echo("Europarl Crawler rules:")
    click.echo(table)
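For context, rules_function reads like the body of a Click command. A minimal sketch of how it could be wired up, assuming hypothetical option names (--rule, --activate/--deactivate) that do not appear in the original source:

import click

# Hypothetical wiring -- the option names are assumptions, not taken from
# the project. multiple=True makes `rule` a tuple of ids, matching the
# `for ru in rule` loop above.
@click.command(name="rules")
@click.option("--rule", "-r", type=int, multiple=True,
              help="id(s) of the rules to modify")
@click.option("--activate/--deactivate", default=True,
              help="target state of the rules")
@click.pass_context
def rules_command(ctx, rule, activate):
    rules_function(ctx, rule, activate)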
Example 2
def test_table_not_exists(db_interface):
    with db_interface.cursor() as db:
        db.cur.execute(
            sql.SQL("drop table {table} cascade").format(
                table=sql.Identifier(Rules.table_name)))

    rules = Rules(db_interface)
    assert rules.table_exists() is False
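The test drops the table and expects Rules.table_exists() to notice. The project's implementation is not shown here; a minimal sketch, using the same psycopg2-style cursor interface the test uses, could query PostgreSQL's catalog:

# Sketch only -- assumes the cursor context manager seen in the test
# above; the real method in the project may differ.
def table_exists(db_interface, table_name=Rules.table_name):
    with db_interface.cursor() as db:
        db.cur.execute(
            "SELECT EXISTS (SELECT 1 FROM information_schema.tables"
            " WHERE table_name = %s)",
            (table_name,),
        )
        return db.cur.fetchone()[0]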
Example 3
def test_get_todo_rule_and_date_combos_one_rule(db_interface, todo_setup):
    # valid session url is found and one rule is activated
    u = URLs(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)
    ru.update_rule_state(id=todo_setup["rule_ids"][1], active=True)
    ret = u.get_todo_rule_and_date_combos(limit=100)
    assert len(ret) == 1
    assert ret[0]["date"] == s.get_date(todo_setup["day_id"])[1]
    rule_id, name, active = ru.get_rule(todo_setup["rule_ids"][1])
    assert ret[0]["rulename"] == name
Example 4
    def startup(self):
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.urls = URLs(self.db)
        self.rules = Rules(self.db)

        self.todo_date_rule_combos = []
        self.url_id = None
        self.url_string = None
        self.logger.info("{} started".format(self.name))
Example 5
def test_register_rule_and_get_by_name_id(db_interface, rulenames):
    rules = Rules(db_interface)
    registered_rules = rules.register_rules(rulenames)

    for original, stored_id in zip(rulenames, registered_rules):
        id_by_name, name_by_name, active_by_name = rules.get_rule(
            rulename=original)
        assert id_by_name == stored_id
        assert isinstance(name_by_name, str)

        assert name_by_name == str(original)
        assert not active_by_name

        id_by_id, name_by_id, active_by_id = rules.get_rule(id=stored_id)
        assert id_by_id == stored_id
        assert isinstance(name_by_id, str)
        assert name_by_id == str(original)
        assert not active_by_id
Example 6
def main():
    config = configuration.read()

    with Context(config) as main_ctx:

        create_table_structure(main_ctx.config)

        db = DBInterface(config=main_ctx.config["DEFAULT"])
        Rules(db).register_rules(rule.rule_registry.all)
        db.close()

        # rules.init_rules(main_ctx.config)

        init_signals(main_ctx.shutdown_event, default_signal_handler,
                     default_signal_handler)

        token_bucket_q = main_ctx.MPQueue(100)
        url_q = main_ctx.MPQueue(10)

        main_ctx.Proc(
            token_bucket_q,
            name="SessionDayChecker",
            worker_class=SessionDayChecker,
            config=config["SessionDayChecker"],
        )

        for instance_id in range(int(config["Downloader"].get("Instances",
                                                              1))):
            main_ctx.Proc(
                token_bucket_q,
                url_q,
                name="Downloader_{}".format(instance_id),
                worker_class=DocumentDownloader,
                config=config["Downloader"],
            )

        main_ctx.Proc(
            url_q,
            name="DateUrlGenerator",
            worker_class=DateUrlGenerator,
            config=config["DateUrlGenerator"],
        )
        main_ctx.Proc(
            token_bucket_q,
            name="TokenGenerator",
            worker_class=TokenBucketWorker,
            config=config["TokenBucketWorker"],
        )

        while not main_ctx.shutdown_event.is_set():
            event = main_ctx.event_queue.safe_get()
            if not event:
                continue
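The token_bucket_q wiring implements the crawler's rate limit: TokenBucketWorker drips tokens onto a bounded queue and each downloader must take a token before making a request. The same idea in a self-contained sketch using plain threading and queue instead of the project's worker classes (all names here are illustrative):

import queue
import threading
import time

def token_generator(bucket, interval, stop_event):
    # Drip at most one token per interval; the bucket size caps bursts.
    while not stop_event.is_set():
        try:
            bucket.put("token", block=False)
        except queue.Full:
            pass
        time.sleep(interval)

def crawler(bucket, stop_event):
    while not stop_event.is_set():
        try:
            bucket.get(timeout=0.5)  # wait for permission to send a request
        except queue.Empty:
            continue
        print("token consumed; one request may go out now")

stop = threading.Event()
bucket = queue.Queue(maxsize=100)  # mirrors MPQueue(100) above
threading.Thread(target=token_generator, args=(bucket, 0.5, stop)).start()
threading.Thread(target=crawler, args=(bucket, stop)).start()
time.sleep(2)
stop.set()  # both threads exit their loops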
Example 7
    def startup(self):
        """
        Initializes a sessionDay-table instance and a long-running
        requests-session object that handles all outgoing requests
        """
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.sessionDay = SessionDay(self.db)
        self.request = Request(self.db)
        self.rules = Rules(self.db)

        self.session = requests.Session()

        self.url, self.url_id = None, None

        self.sleep_end = datetime.now(timezone.utc) - timedelta(hours=1)

        self.logger.info("{} started".format(self.name))
Example 8
def todo_setup(db_interface):
    u = URLs(db_interface)
    r = Request(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)

    day_id = s.insert_date(date.today())
    rule_ids = ru.register_rules(rule_registry.all)
    session_day_id, name, active = ru.get_rule(rulename="session_day")
    session_url_id = u.save_url(date_id=day_id,
                                rule_id=session_day_id,
                                url="www.internet.de")
    rule_ids.remove(session_day_id)  # drop the session_day rule id; keep only document rules

    r.mark_as_requested(url_id=session_url_id,
                        status_code=200,
                        redirected_url="www.internet1.de")

    return {
        "day_id": day_id,
        "rule_ids": rule_ids,
        "session_url_id": session_url_id
    }
Example 9
def main():
    config = configuration.read()

    with Context(config) as main_ctx:

        create_table_structure(main_ctx.config)

        db = DBInterface(config=main_ctx.config["DEFAULT"])
        Rules(db).register_rules(rule.rule_registry.all)
        db.close()

        init_signals(main_ctx.shutdown_event, default_signal_handler,
                     default_signal_handler)

        document_q = main_ctx.MPQueue(30)

        for instance_id in range(
                int(config["PostProcessingWorker"].get("Instances", 1))):
            main_ctx.Proc(
                document_q,
                name="PostProcessingWorker_{}".format(instance_id),
                worker_class=PostProcessingWorker,
                config=config["PostProcessingWorker"],
            )

        main_ctx.Proc(
            document_q,
            name="PostProcessingScheduler",
            worker_class=PostProcessingScheduler,
            config=config["PostProcessingScheduler"],
        )

        while not main_ctx.shutdown_event.is_set():
            event = main_ctx.event_queue.safe_get()
            if not event:
                continue
Example 10
class DateUrlGenerator(ProcWorker):
    def init_args(self, args):
        (self.url_q, ) = args

    def startup(self):
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.urls = URLs(self.db)
        self.rules = Rules(self.db)

        self.todo_date_rule_combos = []
        self.url_id = None
        self.url_string = None
        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        super().shutdown()

    def get_new_combos(self, limit):
        """
        Get a list of new rule and date combinations

        Args:
            limit (int): number of combinations to retrieve

        Returns:
            list: list of combination dictionaries
        """
        self.logger.debug("Getting new date/rule-combinations")

        combos = self.urls.get_todo_rule_and_date_combos(limit=limit)

        # No new combinations from the db; sleep for the polling
        # timeout before retrying
        if len(combos) == 0:
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
        else:
            self.logger.info("Got {} new combinations from database".format(
                len(combos)))

        return combos

    def create_url(self, combo):
        """
        Creates a url based upon a rule and date combination

        Args:
            combo (dict): rule and date combination dictionary

        Returns:
            tuple: url_id and url_string
        """
        self.logger.debug("Applying rule: {} to date: {}".format(
            combo["rulename"], combo["date"]))
        url_id, url_string = self.rules.apply_rule(date_id=combo["date_id"],
                                                   rule_id=combo["rule_id"])
        self.logger.debug("Result: {}".format(url_string))
        return url_id, url_string

    def enqueue_url(self, url_id, url_string):
        """
        Queues up a URL

        Args:
            url_id (int): id of the url
            url_string (str): url string

        Returns:
            tuple: (url_id, url_string); both None if the url was enqueued
            successfully, otherwise the old values are returned unchanged
        """
        try:
            self.logger.debug("Queueing up URL with id: {}".format(url_id))
            self.url_q.put(url_id, timeout=self.DEFAULT_POLLING_TIMEOUT)
            self.logger.info("Queued up URL: {} with id: {}".format(
                url_string, url_id))
            url_string, url_id = None, None
        except Full:
            pass

        return url_id, url_string

    def main_func(self):
        """
        Continuously enqueues new urls.
        The first block keeps a buffer of date and rule combinations
        topped up; subsequent calls consume it one entry at a time,
        creating, storing and enqueueing the corresponding url.
        """

        if len(self.todo_date_rule_combos) == 0:
            self.todo_date_rule_combos = self.get_new_combos(
                limit=self.PREFETCH_LIMIT)
            if len(self.todo_date_rule_combos) == 0:
                time.sleep(self.DEFAULT_POLLING_TIMEOUT * 10)

            return

        if self.url_id is None:
            self.url_id, self.url_string = self.create_url(
                combo=self.todo_date_rule_combos.pop())

        self.url_id, self.url_string = self.enqueue_url(
            url_id=self.url_id, url_string=self.url_string)
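main_func is deliberately re-entrant: each call advances at most one step (refill the buffer, build a url, or enqueue it) so the worker stays responsive to shutdown between steps. The same pattern reduced to a framework-free sketch (names are illustrative, not from the project):

import time
from queue import Full, Queue

def pump_once(state, fetch_batch, build, out_q, poll_timeout=0.1):
    # One non-blocking step of the fetch -> build -> enqueue pipeline.
    if not state["todo"]:
        state["todo"] = fetch_batch()   # refill the prefetch buffer
        if not state["todo"]:
            time.sleep(poll_timeout)    # nothing to do; back off
        return
    if state["item"] is None:
        state["item"] = build(state["todo"].pop())
    try:
        out_q.put(state["item"], timeout=poll_timeout)
        state["item"] = None            # enqueued; forget it
    except Full:
        pass                            # keep the item, retry next call

# Usage: call pump_once() from a loop that also watches a shutdown flag.
state = {"todo": [], "item": None}
q = Queue(maxsize=10)
pump_once(state, lambda: [1, 2, 3], lambda n: n * n, q)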
Example 11
class SessionDayChecker(QueueProcWorker):

    PREFETCH_LIMIT = 100

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dates_to_check = []

    def init_args(self, args):
        (self.work_q, ) = args

    def startup(self):
        """
        Initializes a sessionDay-table instance and a long-running
        requests-session object that handles all outgoing requests
        """
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.sessionDay = SessionDay(self.db)
        self.request = Request(self.db)
        self.rules = Rules(self.db)

        self.session = requests.Session()

        self.url, self.url_id = None, None

        self.sleep_end = datetime.now(timezone.utc) - timedelta(hours=1)

        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        """
        Cleans up by closing the requests-session and deleting the
        sessionDay which will close the database connection
        """
        super().shutdown()
        self.session.close()
        del self.sessionDay

    def check_for_sleep(self, current_time, sleep_end):
        """
        Determines whether the main function is still in its sleep window.
        We don't want to hammer the database with requests every 100ms,
        asking whether a new day has started, so this check lets the main
        function return early until the window is over.

        We also don't want to block in time.sleep() for the whole
        duration: that would make the process unresponsive to the quit
        signal.

        Args:
            current_time (datetime): current execution timestamp
            sleep_end (datetime): end of the sleep window

        Returns:
            bool: still sleeping?
        """

        self.logger.debug("Checking if sleep time is currently active")
        self.logger.debug("Sleep time is over at: {}".format(sleep_end))
        if current_time < sleep_end:
            self.logger.debug(
                "Current time is: {} - Aborting".format(current_time))
            return True
        self.logger.debug(
            "Current time is: {} - Continuing".format(current_time))
        return False

    def get_new_date(self):
        try:
            return self.dates_to_check.pop()
        except IndexError:
            self.logger.debug("Querying database for sessions to check")
            dates = self.sessionDay.get_unchecked_days(self.PREFETCH_LIMIT)
            self.logger.debug(
                "Database returned the following dates: {}".format(dates))
            if len(dates) > 0:
                self.dates_to_check = dates
                return self.dates_to_check.pop()
            else:
                self.logger.debug(
                    "Database returned no value, initializing sleep: {}".
                    format(dates))
                self.set_sleep(timedelta(minutes=1))
                return None

    def crawl(self, session, date):
        """
        Generates a url based upon the passed-in date and then crawls it.
        The outcome (status code and the final url after following all
        redirects) is recorded via mark_as_requested: a 200 response means
        a session took place on that date, a 404 means it did not.

        Args:
            session (requests.Session): long-running session object
            date (datetime.date): date to generate the url from
        """

        if self.url is None:
            date_id = self.sessionDay.insert_date(date)
            rule = self.rules.get_rule(rulename=SessionDayRule.name)
            # construct url to crawl
            self.url_id, self.url = self.rules.apply_rule(rule_id=rule[0],
                                                          date_id=date_id)

        self.logger.debug("Crawling url: {}".format(self.url))

        try:
            resp = self.session.head(self.url, allow_redirects=True)

            self.request.mark_as_requested(
                url_id=self.url_id,
                status_code=resp.status_code,
                redirected_url=resp.url,
            )
            self.logger.debug("Server response: {}".format(resp.status_code))

            if resp.status_code == 200:
                self.logger.info("Identified session on the: {}".format(date))

            if resp.status_code == 404:
                self.logger.info(
                    "Identified no session on the: {}".format(date))

            self.url, self.url_id = None, None

        except requests.ReadTimeout as e:

            self.logger.warn("Timeout for url: {}".format(self.url))
            self.logger.warn("Exception Message: {}".format(e))

            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=408,
                                           redirected_url=self.url)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

        except requests.RequestException as e:
            self.logger.warn("Request exception for url: {}".format(self.url))
            self.logger.warn("Exception Message: {}".format(e))
            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=460,
                                           redirected_url=self.url)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

    def set_sleep(self, delta):
        """
        Sets the sleep timer for the sessiondaychecker

        Args:
            delta (datetime.timedelta): Time to sleep
        """
        self.logger.debug(
            "Setting sleep (next iteration) for: {}".format(delta))
        self.sleep_end = datetime.now(tz=timezone.utc) + delta

    def main_func(self, token):

        # check if function should return early because it is still sleeping
        if self.check_for_sleep(current_time=datetime.now(timezone.utc),
                                sleep_end=self.sleep_end):
            self.logger.debug("Returning token to bucket")
            # put "consumed token back on queue, because no crawling work was done"
            try:
                self.work_q.put(token, timeout=self.DEFAULT_POLLING_TIMEOUT)
            except Full:
                pass
            self.logger.debug("Still sleeping, Returned Token to Bucket")
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

        # get a date value to operate on and start sleeping cycle if db doesn't return a value
        date = self.get_new_date()
        if date is not None:
            self.logger.debug("Checking date: {}".format(date))
        else:
            self.logger.debug("Database returned no unchecked dates, Retrying")
            return

        self.crawl(self.session, date)
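The sleep_end timestamp is the central trick of this worker: rather than calling time.sleep() for a whole minute (which would ignore the shutdown event), it records when the nap ends and cheaply compares against the clock on every iteration. Reduced to its essentials (class and names are illustrative):

from datetime import datetime, timedelta, timezone

class Napper:
    def __init__(self):
        # Start "already awake": an end time safely in the past.
        self.sleep_end = datetime.now(timezone.utc) - timedelta(hours=1)

    def set_sleep(self, delta):
        self.sleep_end = datetime.now(timezone.utc) + delta

    def sleeping(self):
        # Cheap comparison instead of a blocking time.sleep().
        return datetime.now(timezone.utc) < self.sleep_end

napper = Napper()
assert not napper.sleeping()
napper.set_sleep(timedelta(minutes=1))
assert napper.sleeping()  # main_func would return the token and retry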
Example 12
def test_register_rule_multiple_times(db_interface):
    r = Rules(db_interface)
    id_0 = r.register_rules(rule_registry.all)
    id_1 = r.register_rules(rule_registry.all)
    assert id_0 == id_1
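This test pins down the property that makes the startup code in the main() functions above safe: register_rules can run unconditionally on every start and, per this test, returns the same ids each time. A sketch of relying on that at startup, using the same project objects seen throughout this page:

# Idempotent registration; a no-op if the rules already exist.
config = configuration.read()
db = DBInterface(config=config["DEFAULT"])
try:
    rule_ids = Rules(db).register_rules(rule_registry.all)
finally:
    db.close()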
Example 13
def test_get_non_existent_rule(db_interface):
    rules = Rules(db_interface)
    assert rules.get_rule(id=10) is None
Example 14
def test_get_rule_by_nothing(db_interface):
    rules = Rules(db_interface)

    with pytest.raises(AttributeError):
        rules.get_rule()
Example 15
def test_table_exists(db_interface):
    rules = Rules(db_interface)
    assert rules.table_exists()
Example 16
def rulesFix(db_interface):
    r = Rules(db_interface)
    ids = r.register_rules(rule_registry.all)
    return ids