Beispiel #1
0
def test_page_to_pdf_EXCEPTIONS():
    """Call page_to_pdf while subprocess.run fails in both expected ways.

    The exceptions are handed to the mock as *instances*. Neither
    ``subprocess.TimeoutExpired`` nor ``subprocess.CalledProcessError`` can
    be constructed without arguments, so passing the bare class as
    ``side_effect`` makes the mock fail with a ``TypeError`` instead of
    raising the exception this test is meant to simulate.
    """
    simulated_failures = (
        subprocess.TimeoutExpired(cmd='chrome', timeout=30),
        subprocess.CalledProcessError(returncode=1, cmd='chrome'),
    )
    for failure in simulated_failures:
        with patch('subprocess.run', side_effect=failure):
            exo.controlled_browser.page_to_pdf(
                url=exo_url.ExoUrl('https://www.ruediger-voigt.eu'),
                file_path='./fileDownloads/setup-to-fail.pdf',
                queue_id='foo')
Beispiel #2
0
def test_add_save_page_text():
    """Queue 'save page text' tasks from a plain string and from an ExoUrl,
    then force a second version of an already queued task."""
    text_url = 'https://www.ruediger-voigt.eu/examplefile.txt'

    # Plain string URL: add the task and remove it straight away.
    first_uuid = exo.add_save_page_text(text_url)
    exo.delete_from_queue(first_uuid)

    # The same URL wrapped in an ExoUrl object.
    queued_uuid = exo.add_save_page_text(exo_url.ExoUrl(text_url))

    # Forcing a new version has to queue the identical task once more.
    forced_uuid = exo.add_save_page_text(
        exo_url.ExoUrl(text_url), None, None, True)
    assert forced_uuid is not None, 'Did not force downloading page text'

    # Remove the two entries that are still in the queue.
    exo.delete_from_queue(queued_uuid)
    exo.delete_from_queue(forced_uuid)
Beispiel #3
0
 def define_new(self, job_name: str, start_url: Union[exo_url.ExoUrl,
                                                      str]) -> None:
     """Create a new crawl job identified by its name and add a start URL.

     The job name is validated (1 to 127 characters) and stripped of
     surrounding whitespace; a plain string URL is wrapped in an ExoUrl.
     Defining a job again with the same name and start URL only logs a
     warning.

     Raises:
         ValueError: if the name or the start URL is missing, the name has
             an invalid length, or a job with this name but a different
             start URL already exists.
     """
     if not job_name:
         raise ValueError('Provide a valid job_name')
     name_is_valid = userprovided.parameters.string_in_range(
         job_name, 1, 127, True)
     if not name_is_valid:
         raise ValueError('job name must be between 1 and 127 characters.')
     if not start_url:
         raise ValueError('A job needs a Start URL.')
     if not isinstance(start_url, exo_url.ExoUrl):
         start_url = exo_url.ExoUrl(start_url)
     job_name = job_name.strip()

     try:
         self.cur.callproc('define_new_job_SP', (job_name, start_url))
     except pymysql.IntegrityError:
         # The name is taken. That is only acceptable if the stored
         # start URL is the one provided now.
         self.cur.execute('SELECT startURL FROM jobs WHERE jobName = %s;',
                          (job_name, ))
         row = self.cur.fetchone()
         stored_start_url = None
         if row:
             stored_start_url = row[0]  # type: ignore[index]
         if stored_start_url != start_url:
             raise ValueError('A job with the identical name but '
                              '*different* startURL is already defined!')
         logging.warning(
             'A job with identical name and startURL is already defined.')
     else:
         logging.debug('Defined new job.')
Beispiel #4
0
 def return_page_code(self,
                      url: Union[exo_url.ExoUrl, str]) -> str:
     """Fetch a page's code and hand it straight back to the caller.

     The code is *not* stored in the database. A plain string is wrapped
     in an ExoUrl first. Raises ValueError if no URL is provided.
     """
     if not url:
         raise ValueError('Missing URL')
     target = (url if isinstance(url, exo_url.ExoUrl)
               else exo_url.ExoUrl(url))
     return self.action.return_page_code(target)
Beispiel #5
0
def test_job_manager():
    """Walk a crawl job through its life cycle (define, update, read,
    finish) and probe the guard clauses of each step."""
    job = 'Example Job'
    start = 'https://www.example.com'
    bar_page = 'https://www.example.com/bar.html'

    exo.jobs.define_new(job, start)
    # Defining the job again with the same parameters
    # is ignored except for a log entry.
    exo.jobs.define_new(job, start)

    # Definitions that must be rejected, in this order: name too long,
    # same name with a different start URL, missing start URL, missing name.
    rejected_definitions = (
        ('foo' * 127, start),
        (job, 'https://www.example.com/foo.html'),
        (job, ''),
        ('', start),
    )
    for bad_name, bad_url in rejected_definitions:
        with pytest.raises(ValueError):
            exo.jobs.define_new(bad_name, bad_url)

    # Update the URL, then try updates that must be rejected:
    # no job name, no URL, unknown job.
    exo.jobs.update_current_url(job, bar_page)
    rejected_updates = (
        (None, bar_page),
        (job, None),
        ('Unknown Job', 'https://www.example.com/'),
    )
    for bad_name, bad_url in rejected_updates:
        with pytest.raises(ValueError):
            exo.jobs.update_current_url(bad_name, bad_url)

    # Read the URL back.
    assert exo.jobs.get_current_url(job) == bar_page
    with pytest.raises(ValueError):
        exo.jobs.get_current_url('Unknown Job')

    # Mark the job as finished; blank or missing names are rejected.
    exo.jobs.mark_as_finished(job)
    for invalid_name in ('   ', None):
        with pytest.raises(ValueError):
            exo.jobs.mark_as_finished(invalid_name)

    # A finished job no longer provides a current URL.
    with pytest.raises(RuntimeError):
        exo.jobs.get_current_url(job)

    # The same methods accept ExoUrl objects instead of URL strings.
    exourl_job = 'ExoUrl Job'
    exo.jobs.define_new(
        exourl_job, exo_url.ExoUrl('https://www.example.com/exourl.html'))
    exo.jobs.update_current_url(
        exourl_job, exo_url.ExoUrl('https://www.example.com/exourl-2.html'))
    exo.jobs.mark_as_finished(exourl_job)
Beispiel #6
0
def test_return_page_code():
    """return_page_code accepts a string or an ExoUrl, rejects a missing URL
    and raises a RuntimeError for the page set up to fail."""
    home = 'https://www.ruediger-voigt.eu/'
    exo.return_page_code(home)
    exo.return_page_code(exo_url.ExoUrl(home))
    # match= searches the pattern in the string form of the exception.
    with pytest.raises(ValueError, match='Missing URL'):
        exo.return_page_code(None)
    with pytest.raises(RuntimeError, match='Cannot return page code'):
        exo.return_page_code("https://www.ruediger-voigt.eu/throw-402.html")
Beispiel #7
0
 def filemaster_labels_by_url(self, url: Union[exo_url.ExoUrl, str]) -> set:
     """Return the names (not the id numbers!) of the labels attached to
        the filemaster entry associated with this URL.

        The result is a set; it is empty if no label is found."""
     lookup_url = (url if isinstance(url, exo_url.ExoUrl)
                   else exo_url.ExoUrl(url))
     self.cur.callproc('labels_filemaster_by_url_SP', (lookup_url, ))
     rows = self.cur.fetchall()
     if not rows:
         return set()
     # Only the first column of each returned row is used.
     return {row[0] for row in rows}  # type: ignore[index]
 def get_filemaster_id_by_url(
         self, url: Union[exo_url.ExoUrl, str]) -> Optional[str]:
     """Look up the id of the filemaster entry associated with this URL.

     The entry is matched via the SHA-256 hash of the URL, computed by the
     database. Returns None if no row is found.
     """
     lookup_url = (url if isinstance(url, exo_url.ExoUrl)
                   else exo_url.ExoUrl(url))
     self.cur.execute(
         'SELECT id FROM fileMaster WHERE urlHash = SHA2(%s,256);',
         (lookup_url, ))
     row = self.cur.fetchone()
     if not row:
         return None
     return row[0]  # type: ignore[index]
Beispiel #9
0
 def add_save_page_code(self,
                        url: Union[exo_url.ExoUrl, str],
                        labels_master: set = None,
                        labels_version: set = None,
                        prettify_html: bool = False,
                        force_new_version: bool = False) -> Optional[str]:
     """Queue the task to save the HTML code of this URL into the database.

     A plain string is wrapped in an ExoUrl before it reaches the queue.
     Returns whatever add_to_queue returns (annotated as the uuid of the
     task or None).
     """
     action_save_page_code = 2
     target = (url if isinstance(url, exo_url.ExoUrl)
               else exo_url.ExoUrl(url))
     return self.queue.add_to_queue(target, action_save_page_code,
                                    labels_master, labels_version,
                                    prettify_html, force_new_version)
Beispiel #10
0
 def add_file_download(self,
                       url: Union[exo_url.ExoUrl, str],
                       labels_master: set = None,
                       labels_version: set = None,
                       force_new_version: bool = False) -> Optional[str]:
     """Queue the download of the file found at this URL.

     A plain string is wrapped in an ExoUrl before it reaches the queue.
     Returns whatever add_to_queue returns (annotated as the uuid of the
     task or None).
     """
     action_file_download = 1
     target = (url if isinstance(url, exo_url.ExoUrl)
               else exo_url.ExoUrl(url))
     return self.queue.add_to_queue(target, action_file_download,
                                    labels_master, labels_version,
                                    False, force_new_version)
Beispiel #11
0
def test_add_save_page_code():
    """A malformed URL is rejected; valid URLs can be queued both as a
    string and as an ExoUrl."""
    # A URL without a schema is malformed and must not enter the queue.
    with pytest.raises(ValueError, match='Malformed'):
        _ = exo.add_save_page_code('missingschema.example.com')

    home = 'https://www.ruediger-voigt.eu/'

    # Add the task using a plain string and remove it again.
    string_uuid = exo.add_save_page_code(home)
    exo.delete_from_queue(string_uuid)

    # Repeat with an ExoUrl object.
    exourl_uuid = exo.add_save_page_code(exo_url.ExoUrl(home))
    exo.delete_from_queue(exourl_uuid)
Beispiel #12
0
 def update_current_url(self, job_name: str,
                        current_url: Union[exo_url.ExoUrl, str]) -> None:
     """Set the currentUrl of a specific job.

     Raises:
         ValueError: if the job name or the URL is missing, or if the
             database reports that no row was affected.
     """
     if not job_name:
         raise ValueError('Provide the job name.')
     if not current_url:
         raise ValueError('Current URL must not be empty.')
     new_url = (current_url if isinstance(current_url, exo_url.ExoUrl)
                else exo_url.ExoUrl(current_url))
     # The procedure is run via execute() because that returns the
     # number of affected rows, which callproc() does not.
     rows_changed = self.cur.execute(
         'CALL job_update_current_url_SP(%s, %s);', (job_name, new_url))
     if rows_changed == 0:
         raise ValueError('A job with this name is not known.')
Beispiel #13
0
 def add_save_page_text(self,
                        url: Union[exo_url.ExoUrl, str],
                        labels_master: set = None,
                        labels_version: set = None,
                        force_new_version: bool = False) -> Optional[str]:
     """Queue the task 'extract the text (not the code) of an HTML page
        and store it into the database'.

        This can be useful for some language processing tasks. Compared to
        add_save_page_code, however, it removes the possibility to work on
        a specific part of the page using a CSS selector."""
     action_save_page_text = 4
     target = (url if isinstance(url, exo_url.ExoUrl)
               else exo_url.ExoUrl(url))
     return self.queue.add_to_queue(target, action_save_page_text,
                                    labels_master, labels_version,
                                    True, force_new_version)
Beispiel #14
0
def test_add_file_download():
    """An unsupported protocol is rejected; valid downloads can be queued
    from a plain string as well as from an ExoUrl."""
    # ftp is not a supported protocol.
    with pytest.raises(ValueError, match='invalid or unsupported'):
        _ = exo.add_file_download('ftp://www.ruediger-voigt.eu/')

    not_added = 'File download was not added'

    # Standard case: the URL is a plain string.
    string_uuid = exo.add_file_download(
        'https://www.ruediger-voigt.eu/examplefile.txt')
    assert string_uuid is not None, not_added
    exo.delete_from_queue(string_uuid)

    # The same with an ExoUrl object.
    exourl_uuid = exo.add_file_download(
        exo_url.ExoUrl('https://www.example.com/exo-url-test.html'))
    assert exourl_uuid is not None, not_added
    exo.delete_from_queue(exourl_uuid)
Beispiel #15
0
 def add_page_to_pdf(self,
                     url: Union[exo_url.ExoUrl, str],
                     labels_master: set = None,
                     labels_version: set = None,
                     force_new_version: bool = False) -> Optional[str]:
     """Queue the task to print this URL to PDF with headless Chrome.

     The task is queued even if no browser is available; in that case a
     warning is logged. Returns whatever add_to_queue returns (annotated
     as the uuid of the task or None).
     """
     action_page_to_pdf = 3
     target = (url if isinstance(url, exo_url.ExoUrl)
               else exo_url.ExoUrl(url))
     browser_missing = not self.controlled_browser.browser_present
     if browser_missing:
         logging.warning(
             'Will add this task to the queue, but without Chrome or '
             'Chromium it cannot run! Provide the path to the '
             'executable when you initialize exoskeleton.')
     return self.queue.add_to_queue(target, action_page_to_pdf,
                                    labels_master, labels_version,
                                    False, force_new_version)
Beispiel #16
0
    def assign_labels_to_master(self, url: Union[exo_url.ExoUrl, str],
                                labels: set) -> None:
        """Attach one or multiple labels to the *fileMaster* entry of a URL.

        Duplicates are removed, and labels are added to the label list
        first where necessary. Without any labels the call returns
        immediately."""
        if not labels:
            return None

        master_url = (url if isinstance(url, exo_url.ExoUrl)
                      else exo_url.ExoUrl(url))

        # Work on a set to avoid duplicates. The conversion accepts
        # a single string or a list type as well.
        wanted_labels = userprovided.parameters.convert_to_set(labels)

        # Make sure every label is in the database table. Per the original
        # author's note, the DBMS ignores labels that already exist or are
        # malformed.
        for label_name in wanted_labels:
            self.define_new_label(label_name)

        wanted_ids = self.get_label_ids(wanted_labels)

        # Which labels are already associated with this fileMaster entry?
        self.cur.execute(
            'SELECT labelID FROM labelToMaster '
            'WHERE urlHash = SHA2(%s,256);', (master_url, ))
        linked_rows: Optional[tuple] = self.cur.fetchall()
        # NOTE(review): the fetched rows are kept as whole rows here. The
        # difference below only filters anything if get_label_ids() yields
        # elements of the same shape - confirm against that method.
        already_linked = set(linked_rows) if linked_rows else set()

        # Only labels that are not associated yet need an INSERT.
        missing_ids = tuple(wanted_ids - already_linked)
        if missing_ids:
            # executemany expects one parameter tuple per new association.
            new_links = [(label_id, master_url) for label_id in missing_ids]
            self.cur.executemany(
                'INSERT IGNORE INTO labelToMaster (labelID, urlHash) '
                'VALUES (%s, SHA2(%s,256));', new_links)
        return None
Beispiel #17
0
def test_assign_labels_to_master():
    """Assign labels at filemaster level via the URL and read them back.

    The guard checks carry an explicit ``assert``: a bare ``... is None``
    expression is evaluated and thrown away, so it can never fail.
    """
    # Add a task without any filemaster label
    test_url = 'https://www.example.com/assign-label-to-fm.html'
    test_uuid = exo.add_page_to_pdf(test_url)
    # Guard clause: without labels the method has to return None
    assert exo.labels.assign_labels_to_master(test_url, set()) is None
    assert exo.labels.assign_labels_to_master(test_url, None) is None
    # Now use the URL to add some labels at filemaster level
    fm_labels = {'assign_to_fm_test_1', 'assign_to_fm_test_2'}
    exo.labels.assign_labels_to_master(test_url, fm_labels)
    # pull the labels and compare them
    assert exo.labels.filemaster_labels_by_url(test_url) == fm_labels
    # repeat with ExoUrl
    assert exo.labels.filemaster_labels_by_url(
        exo_url.ExoUrl(test_url)) == fm_labels
    # clean up
    exo.delete_from_queue(test_uuid)
    # Two new labels were created above; keep the shared counter in sync.
    test_counter['num_expected_labels'] += 2
def test_exo_url_DUNDERS():
    """str() of an ExoUrl gives back the URL string it was created from."""
    url_str = 'https://www.example.com'
    assert str(exo_url.ExoUrl(url_str)) == url_str
def test_exo_url_GUARDS():
    """Creating an ExoUrl without a URL raises a ValueError that names
    the problem."""
    # match= searches the pattern in the string form of the exception.
    with pytest.raises(ValueError, match="Missing URL"):
        exo_url.ExoUrl(None)
Beispiel #20
0
def test_log_rate_limit_hit():
    """Log a rate limit hit for an example URL, then let the error
    handling forget the permanent errors."""
    exo.stats.log_rate_limit_hit(exo_url.ExoUrl('https://www.example.com'))
    exo.errorhandling.forget_permanent_errors()
Beispiel #21
0
def test_add_page_to_pdf():
    """Queue a page-to-PDF task from a string and from an ExoUrl, then
    remove both tasks again."""
    example = 'https://www.example.com'
    queued_uuids = [
        exo.add_page_to_pdf(example),
        exo.add_page_to_pdf(exo_url.ExoUrl(example)),
    ]
    # clean up
    for task_uuid in queued_uuids:
        exo.delete_from_queue(task_uuid)
Beispiel #22
0
def test_exo_url_generate_sha256_hash(url: str):
    """The hash exposed by ExoUrl has to equal the SHA-256 hash the
    database computes for the same URL."""
    python_side = exo_url.ExoUrl(url).hash
    exo.cur.execute('SELECT SHA2(%s, 256);', (url, ))
    row = exo.cur.fetchone()
    assert python_side == row[0]
    def process_queue(self) -> None:
        """Process the queue task by task.

        Logs the queue statistics once, then loops: fetch the next task,
        skip it if its host is on the blocklist, otherwise dispatch it by
        its action id, send the milestone message and wait a random
        interval.

        The loop only ends when ``stop_if_queue_empty`` is set, no task is
        actionable and no task with a temporary error is left. Otherwise
        the method sleeps ``queue_revisit`` seconds and checks again.

        Raises:
            ConnectionError: if the database connection was lost and the
                single attempt to restore it failed.
            pymysql.err.OperationalError: re-raised for every operational
                error other than errno 2013.
        """
        self.stats.log_queue_stats()

        while True:
            try:
                next_in_queue = self.get_next_task()
            except pymysql.err.OperationalError as op_err:
                if op_err.args[0] == 2013:  # errno: treated as lost connection
                    # this error is unusual. Give the db some time:
                    logging.error('Lost database connection. ' +
                                  'Trying to restore it in 10 seconds ...')
                    time.sleep(10)
                    try:
                        # One retry only: get a fresh cursor and fetch the
                        # task again.
                        self.cur = self.db_connection.get_cursor()
                        next_in_queue = self.get_next_task()
                        logging.info('Restored database connection!')
                    except Exception as exc:
                        # The retry failed for whatever reason: notify and
                        # abort with the original exception chained.
                        msg = 'Could not reestablish database connection'
                        logging.exception(msg, exc_info=True)
                        self.notify.send_msg_abort_lost_db()
                        raise ConnectionError(msg) from exc
                else:
                    logging.error('Unexpected Operational Error',
                                  exc_info=True)
                    raise

            if next_in_queue is None:
                # no actionable item in the queue
                if self.stop_if_queue_empty:
                    # Bot is configured to stop if queue is empty
                    # => check if that is only temporary or everything is done

                    if self.stats.num_tasks_w_temporary_errors() > 0:
                        # there are still tasks, but they have to wait
                        logging.debug(
                            "Tasks with temporary errors: " +
                            "waiting %s seconds until next try.",
                            self.queue_revisit)
                        time.sleep(self.queue_revisit)
                        continue

                    # Nothing left (i.e. num_temp_errors == 0)
                    logging.info('Queue empty. Bot stops as configured.')

                    # Permanent errors do not keep the bot running; they
                    # are only reported before the finish message.
                    num_permanent_errors = self.stats.num_tasks_w_permanent_errors(
                    )
                    if num_permanent_errors > 0:
                        logging.error("%s permanent errors!",
                                      num_permanent_errors)
                    self.notify.send_msg_finish()
                    break

                # Not configured to stop: wait and look at the queue again.
                logging.debug(
                    "No actionable task: waiting %s seconds until next check",
                    self.queue_revisit)
                time.sleep(self.queue_revisit)
                continue

            # Got a task from the queue!
            # Fields read from the task: 0 = queue id, 1 = action id,
            # 2 = URL, 4 = prettify flag. Index 3 is not used here.
            queue_id = next_in_queue[0]
            action = next_in_queue[1]
            url = exo_url.ExoUrl(next_in_queue[2])
            prettify_html = (next_in_queue[4] == 1)

            # The FQDN might have been added to the blocklist *after*
            # the task entered into the queue!
            if self.blocklist.check_blocklist(str(url.hostname)):
                logging.error(
                    'Cannot process queue item: FQDN meanwhile on blocklist!')
                self.delete_from_queue(queue_id)
                logging.info('Removed item from queue: FQDN on blocklist.')
            else:
                if action == 1:  # download file to disk
                    self.actions.get_object(queue_id, 'file', url)
                elif action == 2:  # save page code into database
                    self.actions.get_object(queue_id, 'content', url,
                                            prettify_html)
                elif action == 3:  # headless Chrome to create PDF
                    self.actions.get_object(queue_id, 'page_to_pdf', url)
                elif action == 4:  # save page text into database
                    self.actions.get_object(queue_id, 'text', url)
                else:
                    # An unknown id is only logged; the milestone message
                    # and the wait below still happen.
                    logging.error('Unknown action id!')

                self.notify.send_msg_milestone()

                # wait some interval to avoid overloading the server
                self.time.random_wait()
Beispiel #24
0
def test_log_temporary_problem():
    """Log a temporary problem for an example URL, then let the error
    handling forget the temporary errors."""
    exo.stats.log_temporary_problem(exo_url.ExoUrl('https://www.example.com'))
    exo.errorhandling.forget_temporary_errors()