def rules_function(ctx, rule, activate):
    """
    Function for ``eurocli rules [...]``

    Args:
        ctx (context): context object
        rule (tuple(int)): id(s) of the rule(s) to modify
        activate (bool): target state of the rule(s)
    """
    r = Rules(ctx.obj["db"])

    if rule:
        for ru in rule:
            try:
                r.update_rule_state(ru, activate)
            except Exception as e:
                print(e)

    values, keys = r.get_rules()

    table = BeautifulTable()
    table.columns.header = keys
    for row in values:
        table.rows.append(row)

    click.echo("Europarl Crawler rules:")
    click.echo(table)
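# A minimal sketch of how rules_function could be wired up as a Click command.
# The command name, option names, and flag pair below are assumptions for
# illustration only; the actual eurocli definitions may differ.
import click


@click.command(name="rules")
@click.option("--rule", "-r", type=int, multiple=True,
              help="Id(s) of the rule(s) to modify")
@click.option("--activate/--deactivate", "activate", default=None,
              help="Target state of the selected rule(s)")
@click.pass_context
def rules_command(ctx, rule, activate):
    rules_function(ctx, rule, activate)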
def test_table_not_exists(db_interface):
    with db_interface.cursor() as db:
        db.cur.execute(
            sql.SQL("drop table {table} cascade").format(
                table=sql.Identifier(Rules.table_name)))

    rules = Rules(db_interface)
    assert rules.table_exists() is False
def test_get_todo_rule_and_date_combos_one_rule(db_interface, todo_setup):
    # a valid session url is found and one rule is activated
    u = URLs(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)

    ru.update_rule_state(id=todo_setup["rule_ids"][1], active=True)

    ret = u.get_todo_rule_and_date_combos(limit=100)

    assert len(ret) == 1
    assert ret[0]["date"] == s.get_date(todo_setup["day_id"])[1]

    id, name, active = ru.get_rule(todo_setup["rule_ids"][1])
    assert ret[0]["rulename"] == name
def startup(self):
    super().startup()
    self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

    self.db = DBInterface(config=self.config)
    self.db.connection_name = self.name

    self.urls = URLs(self.db)
    self.rules = Rules(self.db)

    self.todo_date_rule_combos = []
    self.url_id = None
    self.url_string = None

    self.logger.info("{} started".format(self.name))
def test_register_rule_and_get_by_name_id(db_interface, rulenames):
    rules = Rules(db_interface)

    registered_rules = rules.register_rules(rulenames)

    for original, stored_id in zip(rulenames, registered_rules):
        id_by_name, name_by_name, active_by_name = rules.get_rule(
            rulename=original)
        assert id_by_name == stored_id
        assert type(name_by_name) == str
        assert name_by_name == str(original)
        assert not active_by_name

        id_by_id, name_by_id, active_by_id = rules.get_rule(id=stored_id)
        assert id_by_id == stored_id
        assert type(name_by_id) == str
        assert name_by_id == str(original)
        assert not active_by_id
def main():
    config = configuration.read()

    with Context(config) as main_ctx:
        create_table_structure(main_ctx.config)

        db = DBInterface(config=main_ctx.config["DEFAULT"])
        # register all known rules, matching the accessor used elsewhere
        Rules(db).register_rules(rule.rule_registry.all)
        db.close()
        # rules.init_rules(main_ctx.config)

        init_signals(main_ctx.shutdown_event, default_signal_handler,
                     default_signal_handler)

        token_bucket_q = main_ctx.MPQueue(100)
        url_q = main_ctx.MPQueue(10)

        main_ctx.Proc(
            token_bucket_q,
            name="SessionDayChecker",
            worker_class=SessionDayChecker,
            config=config["SessionDayChecker"],
        )

        for instance_id in range(int(config["Downloader"].get("Instances", 1))):
            main_ctx.Proc(
                token_bucket_q,
                url_q,
                name="Downloader_{}".format(instance_id),
                worker_class=DocumentDownloader,
                config=config["Downloader"],
            )

        main_ctx.Proc(
            url_q,
            name="DateUrlGenerator",
            worker_class=DateUrlGenerator,
            config=config["DateUrlGenerator"],
        )

        main_ctx.Proc(
            token_bucket_q,
            name="TokenGenerator",
            worker_class=TokenBucketWorker,
            config=config["TokenBucketWorker"],
        )

        while not main_ctx.shutdown_event.is_set():
            event = main_ctx.event_queue.safe_get()
            if not event:
                continue
def startup(self):
    """
    Initializes a sessionDay-table instance and a long-running
    requests-session object which will handle all outgoing requests
    """
    super().startup()
    self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

    self.db = DBInterface(config=self.config)
    self.db.connection_name = self.name

    self.sessionDay = SessionDay(self.db)
    self.request = Request(self.db)
    self.rules = Rules(self.db)

    self.session = requests.Session()
    self.url, self.url_id = None, None

    self.sleep_end = datetime.now(timezone.utc) - timedelta(hours=1)

    self.logger.info("{} started".format(self.name))
def todo_setup(db_interface):
    u = URLs(db_interface)
    r = Request(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)

    day_id = s.insert_date(date.today())
    rule_ids = ru.register_rules(rule_registry.all)

    session_day_id, name, active = ru.get_rule(rulename="session_day")
    session_url_id = u.save_url(date_id=day_id,
                                rule_id=session_day_id,
                                url="www.internet.de")

    # drop the session_day rule from the todo list: a url for it is already stored
    rule_ids.remove(session_day_id)

    r.mark_as_requested(url_id=session_url_id,
                        status_code=200,
                        redirected_url="www.internet1.de")

    return {
        "day_id": day_id,
        "rule_ids": rule_ids,
        "session_url_id": session_url_id
    }
def main():
    config = configuration.read()

    with Context(config) as main_ctx:
        create_table_structure(main_ctx.config)

        db = DBInterface(config=main_ctx.config["DEFAULT"])
        Rules(db).register_rules(rule.rule_registry.all)
        db.close()

        init_signals(main_ctx.shutdown_event, default_signal_handler,
                     default_signal_handler)

        document_q = main_ctx.MPQueue(30)

        for instance_id in range(
                int(config["PostProcessingWorker"].get("Instances", 1))):
            main_ctx.Proc(
                document_q,
                name="PostProcessingWorker_{}".format(instance_id),
                worker_class=PostProcessingWorker,
                config=config["PostProcessingWorker"],
            )

        main_ctx.Proc(
            document_q,
            name="PostProcessingScheduler",
            worker_class=PostProcessingScheduler,
            config=config["PostProcessingScheduler"],
        )

        while not main_ctx.shutdown_event.is_set():
            event = main_ctx.event_queue.safe_get()
            if not event:
                continue
class DateUrlGenerator(ProcWorker):
    def init_args(self, args):
        (self.url_q, ) = args

    def startup(self):
        super().startup()
        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.urls = URLs(self.db)
        self.rules = Rules(self.db)

        self.todo_date_rule_combos = []
        self.url_id = None
        self.url_string = None

        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        super().shutdown()

    def get_new_combos(self, limit):
        """
        Get a list of new rule and date combinations

        Args:
            limit (int): amount of combinations that should be retrieved

        Returns:
            list: list of combination dictionaries
        """
        self.logger.debug("Getting new date/rule-combinations")
        combos = self.urls.get_todo_rule_and_date_combos(limit=limit)

        # got no new combinations from the db: sleep for the polling timeout
        # before retrying
        if len(combos) == 0:
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
        else:
            self.logger.info("Got {} new combinations from database".format(
                len(combos)))

        return combos

    def create_url(self, combo):
        """
        Creates a url based upon a rule and date combination

        Args:
            combo (dict): rule and date combination dictionary

        Returns:
            tuple: url_id and url_string
        """
        self.logger.debug("Applying rule: {} to date: {}".format(
            combo["rulename"], combo["date"]))

        url_id, url_string = self.rules.apply_rule(date_id=combo["date_id"],
                                                   rule_id=combo["rule_id"])
        self.logger.debug("Result: {}".format(url_string))

        return url_id, url_string

    def enqueue_url(self, url_id, url_string):
        """
        Queues up a URL

        Args:
            url_id (int): id of the url
            url_string (str): url string

        Returns:
            tuple: url_id and url_string; both are None if the url was
            enqueued successfully, otherwise the old values are kept so the
            caller can retry
        """
        try:
            self.logger.debug("Queueing up URL with id: {}".format(url_id))
            self.url_q.put(url_id, timeout=self.DEFAULT_POLLING_TIMEOUT)
            self.logger.info("Queued up URL: {} with id: {}".format(
                url_string, url_id))
            url_string, url_id = None, None
        except Full:
            pass

        return url_id, url_string

    def main_func(self):
        """
        Continuously enqueue new urls.

        The first block fills a buffer of date and rule combinations. Each
        iteration then consumes one entry from this buffer, creates the url,
        stores it and enqueues it.
        """
        if len(self.todo_date_rule_combos) == 0:
            self.todo_date_rule_combos = self.get_new_combos(
                limit=self.PREFETCH_LIMIT)
            if len(self.todo_date_rule_combos) == 0:
                time.sleep(self.DEFAULT_POLLING_TIMEOUT * 10)
                return

        if self.url_id is None:
            self.url_id, self.url_string = self.create_url(
                combo=self.todo_date_rule_combos.pop())

        self.url_id, self.url_string = self.enqueue_url(
            url_id=self.url_id, url_string=self.url_string)
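# A standalone sketch of the non-blocking enqueue/retry pattern used by
# DateUrlGenerator.enqueue_url() above: when the queue is full, keep the item
# and retry on the next main_func() iteration instead of blocking. The helper
# name and timeout are illustrative and not part of the project.
from queue import Full


def try_enqueue(q, item, timeout=0.1):
    """Return None if the item was enqueued, else the item so the caller can retry."""
    try:
        q.put(item, timeout=timeout)
        return None
    except Full:
        return item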
class SessionDayChecker(QueueProcWorker):

    PREFETCH_LIMIT = 100

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dates_to_check = []

    def init_args(self, args):
        (self.work_q, ) = args

    def startup(self):
        """
        Initializes a sessionDay-table instance and a long-running
        requests-session object which will handle all outgoing requests
        """
        super().startup()
        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.sessionDay = SessionDay(self.db)
        self.request = Request(self.db)
        self.rules = Rules(self.db)

        self.session = requests.Session()
        self.url, self.url_id = None, None

        self.sleep_end = datetime.now(timezone.utc) - timedelta(hours=1)

        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        """
        Cleans up by closing the requests-session and deleting the sessionDay
        instance, which will close the database connection
        """
        super().shutdown()
        self.session.close()
        del self.sessionDay

    def check_for_sleep(self, current_time, sleep_end):
        """
        Determines whether the main function is still sleeping.

        We don't want to hammer the database with a request every 100ms,
        querying whether a new day has started. This function checks if the
        sleeping time window is over, so the main function can return early
        or continue computing. We also don't want to block with time.sleep()
        for the whole sleeping duration, because that would make the process
        unresponsive to the quit signal.

        Args:
            current_time (datetime): current execution timestamp
            sleep_end (datetime): timestamp at which the sleep window ends

        Returns:
            boolean: still sleeping?
        """
        self.logger.debug("Checking if sleep time is currently active")
        self.logger.debug("Sleep time is over at: {}".format(sleep_end))

        if current_time < sleep_end:
            self.logger.debug(
                "Current time is: {} - Aborting".format(current_time))
            return True

        self.logger.debug(
            "Current time is: {} - Continuing".format(current_time))
        return False

    def get_new_date(self):
        try:
            return self.dates_to_check.pop()
        except IndexError:
            self.logger.debug("Querying database for sessions to check")
            dates = self.sessionDay.get_unchecked_days(self.PREFETCH_LIMIT)
            self.logger.debug(
                "Database returned the following dates: {}".format(dates))

            if len(dates) > 0:
                self.dates_to_check = dates
                return self.dates_to_check.pop()
            else:
                self.logger.debug(
                    "Database returned no value, initializing sleep: {}".
                    format(dates))
                self.set_sleep(timedelta(minutes=1))
                return None

    def crawl(self, session, date):
        """
        Generates a url based upon the passed-in date and then crawls it.
        Returns the status code and whether crawling the url was a hit or a
        miss. Additionally it returns the generated document_url and the
        final url after following all redirects.

        Args:
            session (requests.Session): long-running session object
            date (datetime.date): date to generate the url from

        Returns:
            Tuple(hit, status_code, generated_url, final_url)
        """
        if self.url is None:
            date_id = self.sessionDay.insert_date(date)
            rule = self.rules.get_rule(rulename=SessionDayRule.name)

            # construct url to crawl
            self.url_id, self.url = self.rules.apply_rule(rule_id=rule[0],
                                                          date_id=date_id)

        self.logger.debug("Crawling url: {}".format(self.url))

        try:
            resp = self.session.head(self.url, allow_redirects=True)
            self.request.mark_as_requested(
                url_id=self.url_id,
                status_code=resp.status_code,
                redirected_url=resp.url,
            )
            self.logger.debug("Server response: {}".format(resp.status_code))

            if resp.status_code == 200:
                self.logger.info("Identified session on the: {}".format(date))
            if resp.status_code == 404:
                self.logger.info(
                    "Identified no session on the: {}".format(date))

            self.url, self.url_id = None, None

        except requests.ReadTimeout as e:
            self.logger.warning("Timeout for url: {}".format(self.url))
            self.logger.warning("Exception Message: {}".format(e))
            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=408,
                                           redirected_url=self.url)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return
        except requests.RequestException as e:
            self.logger.warning(
                "Request exception for url: {}".format(self.url))
            self.logger.warning("Exception Message: {}".format(e))
            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=460,
                                           redirected_url=self.url)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

    def set_sleep(self, delta):
        """
        Sets the sleep timer for the SessionDayChecker

        Args:
            delta (datetime.timedelta): time to sleep
        """
        self.logger.debug(
            "Setting sleep (next iteration) for: {}".format(delta))
        self.sleep_end = datetime.now(tz=timezone.utc) + delta

    def main_func(self, token):
        # check if the function should return early because it is still sleeping
        if self.check_for_sleep(current_time=datetime.now(timezone.utc),
                                sleep_end=self.sleep_end):
            # put the consumed token back on the queue because no crawling
            # work was done
            self.logger.debug("Returning token to bucket")
            try:
                self.work_q.put(token, timeout=self.DEFAULT_POLLING_TIMEOUT)
            except Full:
                pass
            self.logger.debug("Still sleeping, returned token to bucket")
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

        # get a date to operate on and start the sleep cycle if the db
        # doesn't return a value
        date = self.get_new_date()
        if date is not None:
            self.logger.debug("Checking date: {}".format(date))
        else:
            self.logger.debug(
                "Database returned no unchecked dates, retrying")
            return

        self.crawl(self.session, date)
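# A minimal standalone sketch of the HEAD-request check that
# SessionDayChecker.crawl() performs, using only the requests library. The
# helper name and timeout are illustrative and not part of the project.
import requests


def day_has_session(url, timeout=10):
    """Return True if the server answers 200 for a generated session-day url."""
    resp = requests.head(url, allow_redirects=True, timeout=timeout)
    return resp.status_code == 200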
def test_register_rule_multiple_times(db_interface):
    r = Rules(db_interface)

    id_0 = r.register_rules(rule_registry.all)
    id_1 = r.register_rules(rule_registry.all)

    assert id_0 == id_1
def test_get_non_existent_rule(db_interface):
    rules = Rules(db_interface)
    assert rules.get_rule(id=10) is None
def test_get_rule_by_nothing(db_interface):
    rules = Rules(db_interface)
    with pytest.raises(AttributeError):
        rules.get_rule()
def test_table_exists(db_interface):
    rules = Rules(db_interface)
    assert rules.table_exists()
def rulesFix(db_interface):
    r = Rules(db_interface)
    ids = r.register_rules(rule_registry.all)
    return ids
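# Hypothetical test consuming rulesFix, assuming it is registered as a pytest
# fixture (e.g. decorated with @pytest.fixture in conftest.py). The test name
# and assertion are illustrative only.
def test_all_registered_rules_are_retrievable(rulesFix, db_interface):
    rules = Rules(db_interface)
    for rule_id in rulesFix:
        # every registered rule id should resolve to a stored rule row
        assert rules.get_rule(id=rule_id) is not None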