def test_page_to_pdf_EXCEPTIONS():
    "page_to_pdf must survive subprocess timeouts and failed runs."
    failure_modes = (subprocess.TimeoutExpired, subprocess.CalledProcessError)
    for failure in failure_modes:
        with patch('subprocess.run', side_effect=failure):
            exo.controlled_browser.page_to_pdf(
                url=exo_url.ExoUrl('https://www.ruediger-voigt.eu'),
                file_path='./fileDownloads/setup-to-fail.pdf',
                queue_id='foo')
def test_add_save_page_text():
    "Queue 'save page text' tasks via plain string and via ExoUrl."
    sample = 'https://www.ruediger-voigt.eu/examplefile.txt'
    # standard call with a plain string URL
    first_uuid = exo.add_save_page_text(sample)
    exo.delete_from_queue(first_uuid)
    # standard call with an ExoUrl object
    second_uuid = exo.add_save_page_text(exo_url.ExoUrl(sample))
    # force adding a second version of the identical task
    third_uuid = exo.add_save_page_text(
        exo_url.ExoUrl(sample), None, None, True)
    assert third_uuid is not None, 'Did not force downloading page text'
    # clean up
    exo.delete_from_queue(second_uuid)
    exo.delete_from_queue(third_uuid)
def define_new(self, job_name: str,
               start_url: Union[exo_url.ExoUrl, str]) -> None:
    "Create a new crawl job identified by its name and add a start URL."
    # -- validate input ------------------------------------------------
    if not job_name:
        raise ValueError('Provide a valid job_name')
    if not userprovided.parameters.string_in_range(job_name, 1, 127, True):
        raise ValueError('job name must be between 1 and 127 characters.')
    if not start_url:
        raise ValueError('A job needs a Start URL.')
    if not isinstance(start_url, exo_url.ExoUrl):
        start_url = exo_url.ExoUrl(start_url)
    job_name = job_name.strip()
    # -- store the job -------------------------------------------------
    try:
        self.cur.callproc('define_new_job_SP', (job_name, start_url))
        logging.debug('Defined new job.')
    except pymysql.IntegrityError:
        # A job with this name exists already. That is harmless if
        # (and only if) its start URL is the same one.
        self.cur.execute('SELECT startURL FROM jobs WHERE jobName = %s;',
                         (job_name, ))
        row = self.cur.fetchone()
        known_start_url = row[0] if row else None  # type: ignore[index]
        if known_start_url != start_url:
            raise ValueError('A job with the identical name but ' +
                             '*different* startURL is already defined!')
        logging.warning(
            'A job with identical name and startURL is already defined.')
def return_page_code(self, url: Union[exo_url.ExoUrl, str]) -> str:
    "Immediately return a page's code. Do *not* store it in the database."
    if not url:
        raise ValueError('Missing URL')
    # normalize plain strings into ExoUrl before delegating
    wrapped = url if isinstance(url, exo_url.ExoUrl) else exo_url.ExoUrl(url)
    return self.action.return_page_code(wrapped)
def test_job_manager():
    "Exercise defining, updating, querying, and finishing crawl jobs."
    example = 'https://www.example.com'
    exo.jobs.define_new('Example Job', example)
    # Defining the job again with the same parameters is ignored
    # except for a log entry
    exo.jobs.define_new('Example Job', example)
    # Job name too long:
    with pytest.raises(ValueError):
        exo.jobs.define_new('foo' * 127, example)
    # try to define job with same name but different start url
    with pytest.raises(ValueError):
        exo.jobs.define_new('Example Job', 'https://www.example.com/foo.html')
    # Missing required job parameters
    with pytest.raises(ValueError):
        exo.jobs.define_new('Example Job', '')
    with pytest.raises(ValueError):
        exo.jobs.define_new('', example)
    # Update the URL
    bar_url = 'https://www.example.com/bar.html'
    exo.jobs.update_current_url('Example Job', bar_url)
    with pytest.raises(ValueError):
        exo.jobs.update_current_url(None, bar_url)
    with pytest.raises(ValueError):
        exo.jobs.update_current_url('Example Job', None)
    with pytest.raises(ValueError):
        exo.jobs.update_current_url('Unknown Job', 'https://www.example.com/')
    # Get the URL
    assert exo.jobs.get_current_url('Example Job') == bar_url
    with pytest.raises(ValueError):
        exo.jobs.get_current_url('Unknown Job')
    # mark a job as finished
    exo.jobs.mark_as_finished('Example Job')
    with pytest.raises(ValueError):
        exo.jobs.mark_as_finished(' ')
    with pytest.raises(ValueError):
        exo.jobs.mark_as_finished(None)
    # the current URL of a finished job is no longer available
    with pytest.raises(RuntimeError):
        exo.jobs.get_current_url('Example Job')
    # Use methods with ExoUrl object instead of URL string
    exo.jobs.define_new(
        'ExoUrl Job', exo_url.ExoUrl('https://www.example.com/exourl.html'))
    exo.jobs.update_current_url(
        'ExoUrl Job', exo_url.ExoUrl('https://www.example.com/exourl-2.html'))
    exo.jobs.mark_as_finished('ExoUrl Job')
def test_return_page_code():
    "return_page_code accepts str and ExoUrl; its guards raise."
    target = 'https://www.ruediger-voigt.eu/'
    exo.return_page_code(target)
    exo.return_page_code(exo_url.ExoUrl(target))
    with pytest.raises(ValueError) as excinfo:
        exo.return_page_code(None)
    assert 'Missing URL' in str(excinfo.value)
    with pytest.raises(RuntimeError) as excinfo:
        exo.return_page_code('https://www.ruediger-voigt.eu/throw-402.html')
    assert 'Cannot return page code' in str(excinfo.value)
def filemaster_labels_by_url(self, url: Union[exo_url.ExoUrl, str]) -> set:
    """Get a list of label names (not id numbers!) attached to a
    specific filemaster entry using the URL associated."""
    if not isinstance(url, exo_url.ExoUrl):
        url = exo_url.ExoUrl(url)
    self.cur.callproc('labels_filemaster_by_url_SP', (url, ))
    rows = self.cur.fetchall()
    if not rows:
        return set()
    # each row is a 1-tuple holding the label name
    return {row[0] for row in rows}  # type: ignore[index]
def get_filemaster_id_by_url(
        self, url: Union[exo_url.ExoUrl, str]) -> Optional[str]:
    "Get the id of the filemaster entry associated with this URL"
    if not isinstance(url, exo_url.ExoUrl):
        url = exo_url.ExoUrl(url)
    self.cur.execute(
        'SELECT id FROM fileMaster ' +
        'WHERE urlHash = SHA2(%s,256);', (url, ))
    row = self.cur.fetchone()
    if not row:
        return None
    return row[0]  # type: ignore[index]
def add_save_page_code(self, url: Union[exo_url.ExoUrl, str],
                       labels_master: Optional[set] = None,
                       labels_version: Optional[set] = None,
                       prettify_html: bool = False,
                       force_new_version: bool = False) -> Optional[str]:
    """Add an URL to the queue to save its HTML code into the database.

    Annotations fixed: the label parameters default to None, so they are
    Optional[set] rather than a bare set (PEP 484 deprecates implicit
    Optional). Returns the UUID assigned by the queue (None if the task
    was not added)."""
    if not isinstance(url, exo_url.ExoUrl):
        url = exo_url.ExoUrl(url)
    # action id 2 = save page source code into the database
    return self.queue.add_to_queue(url, 2, labels_master, labels_version,
                                   prettify_html, force_new_version)
def add_file_download(self, url: Union[exo_url.ExoUrl, str],
                      labels_master: Optional[set] = None,
                      labels_version: Optional[set] = None,
                      force_new_version: bool = False) -> Optional[str]:
    """Add a file download URL to the queue.

    Annotations fixed: the label parameters default to None, so they are
    Optional[set] rather than a bare set (PEP 484 deprecates implicit
    Optional). Returns the UUID assigned by the queue (None if the task
    was not added)."""
    if not isinstance(url, exo_url.ExoUrl):
        url = exo_url.ExoUrl(url)
    # action id 1 = download file to disk; prettify flag is meaningless here
    return self.queue.add_to_queue(url, 1, labels_master, labels_version,
                                   False, force_new_version)
def test_add_save_page_code():
    "Queue 'save page code' tasks; malformed URLs are rejected."
    # malformed URL: must not be added to the queue
    with pytest.raises(ValueError) as excinfo:
        _ = exo.add_save_page_code('missingschema.example.com')
    assert 'Malformed' in str(excinfo.value)
    valid = 'https://www.ruediger-voigt.eu/'
    # add task via plain string
    first_uuid = exo.add_save_page_code(valid)
    exo.delete_from_queue(first_uuid)
    # add task via ExoUrl
    second_uuid = exo.add_save_page_code(exo_url.ExoUrl(valid))
    exo.delete_from_queue(second_uuid)
def update_current_url(self, job_name: str,
                       current_url: Union[exo_url.ExoUrl, str]) -> None:
    "Set the currentUrl for a specific job. "
    if not job_name:
        raise ValueError('Provide the job name.')
    if not current_url:
        raise ValueError('Current URL must not be empty.')
    if not isinstance(current_url, exo_url.ExoUrl):
        current_url = exo_url.ExoUrl(current_url)
    # execute returns affected rows, callproc does not
    changed_rows = self.cur.execute(
        'CALL job_update_current_url_SP(%s, %s);',
        (job_name, current_url))
    if not changed_rows:
        raise ValueError('A job with this name is not known.')
def add_save_page_text(self, url: Union[exo_url.ExoUrl, str],
                       labels_master: Optional[set] = None,
                       labels_version: Optional[set] = None,
                       force_new_version: bool = False) -> Optional[str]:
    """Add the task 'Extract the text (not the code) from a HTML page
    and store it into the database' to the queue.

    This can be useful for some language processing tasks, but compared
    to add_save_page_code this removes the possibility to work on a
    specific part using a CSS selector.

    Annotations fixed: the label parameters default to None, so they are
    Optional[set] rather than a bare set (PEP 484 deprecates implicit
    Optional). Returns the UUID assigned by the queue (None if the task
    was not added)."""
    if not isinstance(url, exo_url.ExoUrl):
        url = exo_url.ExoUrl(url)
    # action id 4 = save page text into the database
    return self.queue.add_to_queue(url, 4, labels_master, labels_version,
                                   True, force_new_version)
def test_add_file_download():
    "Queue file downloads; unsupported protocols are rejected."
    # unsupported protocol
    with pytest.raises(ValueError) as excinfo:
        _ = exo.add_file_download('ftp://www.ruediger-voigt.eu/')
    assert 'invalid or unsupported' in str(excinfo.value)
    # standard add via plain string
    first_uuid = exo.add_file_download(
        'https://www.ruediger-voigt.eu/examplefile.txt')
    assert first_uuid is not None, 'File download was not added'
    exo.delete_from_queue(first_uuid)
    # check with ExoUrl
    second_uuid = exo.add_file_download(
        exo_url.ExoUrl('https://www.example.com/exo-url-test.html'))
    assert second_uuid is not None, 'File download was not added'
    exo.delete_from_queue(second_uuid)
def add_page_to_pdf(self, url: Union[exo_url.ExoUrl, str],
                    labels_master: Optional[set] = None,
                    labels_version: Optional[set] = None,
                    force_new_version: bool = False) -> Optional[str]:
    """Add an URL to the queue to print it to PDF with headless Chrome.

    Annotations fixed: the label parameters default to None, so they are
    Optional[set] rather than a bare set (PEP 484 deprecates implicit
    Optional). Returns the UUID assigned by the queue (None if the task
    was not added)."""
    if not isinstance(url, exo_url.ExoUrl):
        url = exo_url.ExoUrl(url)
    # The task is queued even without a browser, but it cannot run later.
    if not self.controlled_browser.browser_present:
        logging.warning(
            'Will add this task to the queue, but without Chrome or ' +
            'Chromium it cannot run! Provide the path to the ' +
            'executable when you initialize exoskeleton.')
    # action id 3 = print page to PDF with headless Chrome
    return self.queue.add_to_queue(url, 3, labels_master, labels_version,
                                   False, force_new_version)
def assign_labels_to_master(self,
                            url: Union[exo_url.ExoUrl, str],
                            labels: set) -> None:
    """ Assigns one or multiple labels to the *fileMaster* entry.
        Removes duplicates and adds new labels to the label list
        if necessary."""
    # Nothing to do for an empty/None label set.
    if not labels:
        return None
    if not isinstance(url, exo_url.ExoUrl):
        url = exo_url.ExoUrl(url)
    # Using a set to avoid duplicates. However, accept either
    # a single string or a list type.
    label_set = userprovided.parameters.convert_to_set(labels)
    for label in label_set:
        # Make sure all labels are in the database table.
        # -> If they already exist or are malformed, the command
        # will be ignored by the DBMS.
        self.define_new_label(label)
    # Get all label-ids
    id_list = self.get_label_ids(label_set)
    # Check whether some labels are already associated
    # with the fileMaster entry.
    self.cur.execute('SELECT labelID ' +
                     'FROM labelToMaster ' +
                     'WHERE urlHash = SHA2(%s,256);',
                     (url, ))
    ids_found: Optional[tuple] = self.cur.fetchall()
    ids_associated = set()
    if ids_found:
        # NOTE(review): fetchall yields a tuple of row-tuples, so
        # set(ids_found) contains 1-tuples like (5,). If id_list holds
        # bare ids, the set difference below would never remove
        # anything; the INSERT IGNORE still prevents duplicate rows.
        # TODO confirm the element type returned by get_label_ids.
        ids_associated = set(ids_found)
    # ignore all labels already associated:
    remaining_ids = tuple(id_list - ids_associated)
    if len(remaining_ids) > 0:
        # Case: there are new labels
        # Convert into a format to INSERT with executemany
        insert_list = [(id, url) for id in remaining_ids]
        # Add those associatons
        self.cur.executemany('INSERT IGNORE INTO labelToMaster ' +
                             '(labelID, urlHash) ' +
                             'VALUES (%s, SHA2(%s,256));',
                             insert_list)
    return None
def test_assign_labels_to_master():
    """Assign labels at filemaster level and read them back.

    Fix: the two guard checks were bare expression statements
    ('... is None' without 'assert'), so they verified nothing."""
    # Add a task without any filemaster label
    test_url = 'https://www.example.com/assign-label-to-fm.html'
    test_uuid = exo.add_page_to_pdf(test_url)
    # test guard: empty / missing label sets are no-ops returning None
    assert exo.labels.assign_labels_to_master(test_url, set()) is None
    assert exo.labels.assign_labels_to_master(test_url, None) is None
    # Now use the URL to add some labels at filemaster level
    fm_labels = {'assign_to_fm_test_1', 'assign_to_fm_test_2'}
    exo.labels.assign_labels_to_master(test_url, fm_labels)
    # pull the labels and compare them
    assert exo.labels.filemaster_labels_by_url(test_url) == fm_labels
    # repeat with ExoUrl
    assert exo.labels.filemaster_labels_by_url(
        exo_url.ExoUrl(test_url)) == fm_labels
    # clean up
    exo.delete_from_queue(test_uuid)
    test_counter['num_expected_labels'] += 2
def test_exo_url_DUNDERS():
    "str() of an ExoUrl yields the original URL string."
    address = 'https://www.example.com'
    assert str(exo_url.ExoUrl(address)) == address
def test_exo_url_GUARDS():
    "Constructing an ExoUrl from None raises a descriptive ValueError."
    with pytest.raises(ValueError) as excinfo:
        exo_url.ExoUrl(None)
    assert 'Missing URL' in str(excinfo.value)
def test_log_rate_limit_hit():
    "Logging a rate-limit hit with an ExoUrl must not raise."
    exo.stats.log_rate_limit_hit(exo_url.ExoUrl('https://www.example.com'))
    exo.errorhandling.forget_permanent_errors()
def test_add_page_to_pdf():
    "Queue PDF tasks via plain string and via ExoUrl, then clean up."
    queued = [
        exo.add_page_to_pdf('https://www.example.com'),
        exo.add_page_to_pdf(exo_url.ExoUrl('https://www.example.com')),
    ]
    # clean up
    for task_uuid in queued:
        exo.delete_from_queue(task_uuid)
def test_exo_url_generate_sha256_hash(url: str):
    "The Python-side SHA-256 hash must match what the DBMS computes."
    exo.cur.execute('SELECT SHA2(%s, 256);', (url, ))
    db_hash = exo.cur.fetchone()[0]
    assert exo_url.ExoUrl(url).hash == db_hash
def process_queue(self) -> None:
    "Process the queue"
    # Log a snapshot of the queue before starting the main loop.
    self.stats.log_queue_stats()
    while True:
        try:
            next_in_queue = self.get_next_task()
        except pymysql.err.OperationalError as op_err:
            if op_err.args[0] == 2013:  # errno
                # errno 2013 = "Lost connection to MySQL server".
                # this error is unusual. Give the db some time:
                logging.error('Lost database connection. ' +
                              'Trying to restore it in 10 seconds ...')
                time.sleep(10)
                try:
                    # One reconnection attempt; a second failure aborts.
                    self.cur = self.db_connection.get_cursor()
                    next_in_queue = self.get_next_task()
                    logging.info('Restored database connection!')
                except Exception as exc:
                    msg = 'Could not reestablish database connection'
                    logging.exception(msg, exc_info=True)
                    # notify the operator before aborting the bot
                    self.notify.send_msg_abort_lost_db()
                    raise ConnectionError(msg) from exc
            else:
                # Any other operational error is re-raised unchanged.
                logging.error('Unexpected Operational Error', exc_info=True)
                raise

        if next_in_queue is None:
            # no actionable item in the queue
            if self.stop_if_queue_empty:
                # Bot is configured to stop if queue is empty
                # => check if that is only temporary or everything is done
                if self.stats.num_tasks_w_temporary_errors() > 0:
                    # there are still tasks, but they have to wait
                    logging.debug("Tasks with temporary errors: " +
                                  "waiting %s seconds until next try.",
                                  self.queue_revisit)
                    time.sleep(self.queue_revisit)
                    continue
                # Nothing left (i.e. num_temp_errors == 0)
                logging.info('Queue empty. Bot stops as configured.')
                num_permanent_errors = (
                    self.stats.num_tasks_w_permanent_errors())
                if num_permanent_errors > 0:
                    logging.error("%s permanent errors!",
                                  num_permanent_errors)
                self.notify.send_msg_finish()
                break
            # Bot keeps running: sleep, then poll the queue again.
            logging.debug(
                "No actionable task: waiting %s seconds until next check",
                self.queue_revisit)
            time.sleep(self.queue_revisit)
            continue

        # Got a task from the queue!
        # Row layout (by position): 0 = queue id, 1 = action id,
        # 2 = URL, 4 = prettify flag.
        # NOTE(review): index 3 is skipped here — verify the row layout
        # returned by get_next_task.
        queue_id = next_in_queue[0]
        action = next_in_queue[1]
        url = exo_url.ExoUrl(next_in_queue[2])
        prettify_html = (next_in_queue[4] == 1)
        # The FQDN might have been added to the blocklist *after*
        # the task entered into the queue!
        if self.blocklist.check_blocklist(str(url.hostname)):
            logging.error(
                'Cannot process queue item: FQDN meanwhile on blocklist!')
            self.delete_from_queue(queue_id)
            logging.info('Removed item from queue: FQDN on blocklist.')
        else:
            # Dispatch on the numeric action id stored with the task.
            if action == 1:
                # download file to disk
                self.actions.get_object(queue_id, 'file', url)
            elif action == 2:
                # save page code into database
                self.actions.get_object(queue_id, 'content', url,
                                        prettify_html)
            elif action == 3:
                # headless Chrome to create PDF
                self.actions.get_object(queue_id, 'page_to_pdf', url)
            elif action == 4:
                # save page text into database
                self.actions.get_object(queue_id, 'text', url)
            else:
                logging.error('Unknown action id!')
        self.notify.send_msg_milestone()
        # wait some interval to avoid overloading the server
        self.time.random_wait()
def test_log_temporary_problem():
    "Logging a temporary problem with an ExoUrl must not raise."
    exo.stats.log_temporary_problem(exo_url.ExoUrl('https://www.example.com'))
    exo.errorhandling.forget_temporary_errors()