def test_auto_increment(self):
    """The id column must auto-increment across consecutively added rows."""
    with session_scope(self.dbms) as session:
        # Two rows that differ only in their date, added in order
        for day in (2, 3):
            session.add(
                Vacancies(platform=self.platform.name,
                          date=datetime(2019, 1, day),
                          url="http://",
                          title="title",
                          company="aha",
                          search_topic='Java',
                          location="Wien"))
        session.flush()
    with session_scope(self.dbms) as session:
        stored = session.query(Vacancies).all()
        # Ids must have been assigned sequentially starting at 1
        self.assertEqual([row.id for row in stored], [1, 2])
        # Both rows keep the title they were created with
        self.assertEqual([row.title for row in stored], ['title', 'title'])
def load_data_scripture(dbms, platform_registry):
    """Load the platform and vacancies test fixtures from JSON into the database.

    Registers the two platform handlers on *platform_registry* after the
    platform rows have been added, then inserts every vacancy fixture row.
    """
    platform_json_file = os.path.join(ConfigHandler.ROOT_DIR,
                                      "tests/test_data/platform.json")
    vacancies_json_file = os.path.join(ConfigHandler.ROOT_DIR,
                                       "tests/test_data/vacancies.json")
    with session_scope(dbms=dbms) as session:
        # Platforms first, so the vacancy rows can reference them
        with open(platform_json_file, 'r') as json_file:
            for row in json.load(json_file):
                session.add(Platform(name=row['name'],
                                     base_address=row['base_address']))
        platform_registry.register_new_platform(KarriereATHandler)
        platform_registry.register_new_platform(StepStoneHandler)
        session.commit()
        session.flush()
        with open(vacancies_json_file, 'r') as json_file:
            for row in json.load(json_file):
                session.add(
                    Vacancies(platform=row['platform'],
                              search_topic=row['search_type'],
                              date=datetime.strptime(row['date'], '%Y-%m-%d'),
                              url=row['url'],
                              company=row['company'],
                              title=row['title'],
                              location=""))
def test_database_creation(self):
    """Creating the DB file, its tables and the initial data must work end to end."""
    # Start from a clean slate: no database file on disk
    if os.path.isfile(TEST_DB_PATH):
        os.remove(TEST_DB_PATH)
    self.assertEqual(os.path.isfile(TEST_DB_PATH), False)

    dbms = DBHandler(DBHandler.SQLITE, db_name=TEST_DB_NAME)
    dbms.create_database_and_tables()
    # The database file must now exist on disk ...
    self.assertEqual(os.path.isfile(TEST_DB_PATH), True)
    # ... and contain exactly the two expected tables
    self.assertEqual(dbms.get_tables_in_database(),
                     [{"name": "platform"}, {"name": "vacancies"}])

    dbms.load_initial_data()
    with session_scope(dbms) as session:
        platform_data = session.query(Platform).all()
        self.assertEqual(len(platform_data), 1)
        first_row = platform_data[0]
        self.assertEqual([first_row.name, first_row.base_address],
                         ['karriere.at', 'www.karriere.at/'])

    # Clean up the file created by this test
    os.remove(TEST_DB_PATH)
    self.assertEqual(os.path.isfile(TEST_DB_PATH), False)
def _save_vacancy_entries_to_database(self, vacancy_entries: list, search_topic: str) -> bool:
    """Persist every scraped vacancy that is not already stored.

    Duplicate detection compares all columns of the scraped entry (plus
    the given search topic) against the existing rows; matches are skipped.
    Returns True when done; logs and re-raises on any insertion error.
    """
    with session_scope(self.dbms) as session:
        try:
            for entry in vacancy_entries:
                duplicate = session.query(Vacancies.id).filter(
                    Vacancies.platform == entry['platform'],
                    Vacancies.company == entry['company'],
                    Vacancies.url == entry['url'],
                    Vacancies.title == entry['title'],
                    Vacancies.search_topic == search_topic,
                    Vacancies.date == entry['date'],
                    Vacancies.location == entry['location']).scalar() is not None
                if duplicate:
                    # Entry already present in the database -> skip it
                    continue
                session.add(
                    Vacancies(platform=entry['platform'],
                              company=entry['company'],
                              url=entry['url'],
                              title=entry['title'],
                              search_topic=search_topic,
                              date=entry['date'],
                              location=entry['location']))
        except Exception as e:
            print(
                f"{self.header}: ERROR: Could not insert new entries! Msg.: {str(e)}"
            )
            raise
    return True
def test_insert_row_in_articles(self):
    """A single Vacancies row can be inserted and read back with id 1."""
    with session_scope(self.dbms) as session:
        session.add(
            Vacancies(platform=self.platform.name,
                      date=datetime(2019, 1, 1),
                      url="http://",
                      title="title",
                      company="aha",
                      search_topic='Java',
                      location="Wien"))
        session.flush()
    with session_scope(self.dbms) as session:
        stored = session.query(Vacancies).all()
        self.assertEqual(len(stored), 1)
        first = stored[0]
        # The row gets id 1 and keeps the platform it was created with
        self.assertEqual([first.id, first.platform], [1, 'test.at'])
def test_auto_delete_postings_after_x_days(self):
    """Postings older than the retention period must be removed by the cleanup job."""
    # Pin the retention period to 30 days (other tests may have set it to None)
    ConfigHandler.POSTING_RETENTION_IN_DAYS = 30
    today = datetime.now()
    expired_date = today - timedelta(days=ConfigHandler.POSTING_RETENTION_IN_DAYS)
    with session_scope(self.dbms) as session:
        # One fresh posting and one exactly at the retention boundary
        for posting_date in (today, expired_date):
            session.add(
                Vacancies(platform=self.platform.name,
                          date=posting_date,
                          url="http://",
                          title="title",
                          company="aha",
                          search_topic='Java',
                          location="Wien"))
    self.dbms.cleanup_job_postings_in_database()
    with session_scope(self.dbms) as session:
        remaining = session.query(Vacancies).all()
        # The expired posting must be gone ...
        self.assertEqual(len(remaining), 1)
        # ... and only the fresh posting should survive
        self.assertEqual(remaining[0].date, today.date())
def test_inserting_an_already_existing_vacancy(self):
    """A scraped entry that already exists in the database must be skipped."""
    with session_scope(dbms=self.dbms) as session:
        session.add(Vacancies(**self.vacancy_entries[0]))
    with session_scope(dbms=self.dbms) as session:
        # Exactly one row exists before the save attempt
        self.assertEqual(len(session.query(Vacancies).all()), 1)
    self.karriere_at_handler._save_vacancy_entries_to_database(
        vacancy_entries=self.vacancy_entries, search_topic=self.search_topic)
    with session_scope(dbms=self.dbms) as session:
        # The duplicate was skipped, so the row count is unchanged
        self.assertEqual(len(session.query(Vacancies).all()), 1)
def test_create_platform_entries_in_database(self):
    """Each registered platform gets exactly one Platform row in the database."""
    self.platform_registry.create_platform_entries_in_database()
    with session_scope(self.dbms) as session:
        platform_rows = session.query(Platform).all()
        self.assertEqual(len(platform_rows), 2)
        # The stored names must match the registry's keys, in order
        self.assertEqual([row.name for row in platform_rows],
                         list(self.platform_registry.registered_platforms.keys()))
def tearDownClass(cls):
    """Close the shared browser and wipe all Platform rows after the class ran."""
    try:
        # Best effort: the browser may already be closed or never started,
        # and a failure here must not mask the database cleanup below.
        cls.browser_handler.close_browser()
    except Exception:
        # Fix: was a bare `except:`, which would also swallow SystemExit and
        # KeyboardInterrupt. `Exception` keeps the best-effort semantics
        # without hiding interpreter-level signals.
        pass
    # Reset database: delete all platform rows
    dbms = DBHandler(DBHandler.SQLITE, db_name=TEST_DB_NAME)
    with session_scope(dbms) as session:
        session.query(Platform).delete()
    super().tearDownClass()
def test_saving_a_vacancy_list_to_db(self):
    """_save_vacancy_entries_to_database must store every field of a scraped entry."""
    self.karriere_at_handler._save_vacancy_entries_to_database(
        vacancy_entries=self.vacancy_entries, search_topic=self.search_topic)
    with session_scope(dbms=self.dbms) as session:
        stored = session.query(Vacancies).all()
        self.assertEqual(len(stored), 1)
        # Rebuild a dict from the stored row, ignoring the generated id column
        result_dict = {
            column.key: getattr(stored[0], column.key)
            for column in Vacancies.__table__.columns
            if column.key != 'id'
        }
        # The round-tripped row must equal the originally submitted entry
        self.assertEqual(result_dict, self.vacancy_entries[0])
def tearDown(self):
    """Remove every Vacancies row so each test starts from an empty table."""
    with session_scope(self.dbms) as session:
        session.query(Vacancies).delete()
def tearDownClass(cls):
    """Delete all Platform rows once the test class has finished."""
    # NOTE(review): unlike the other tearDownClass in this project, this one
    # does not call super().tearDownClass() — confirm that is intentional.
    dbms = DBHandler(DBHandler.SQLITE, db_name=TEST_DB_NAME)
    with session_scope(dbms) as session:
        session.query(Platform).delete()
def test_disabled_retention_in_days(self):
    """A config value of "disabled" must turn off the automatic posting deletion."""
    # Start from a defined retention period of 30 days
    # (other tests may have set it to None)
    ConfigHandler.POSTING_RETENTION_IN_DAYS = 30
    today = datetime.now()
    expired_date = today - timedelta(days=ConfigHandler.POSTING_RETENTION_IN_DAYS)

    # Load a config file whose retention setting is "disabled"
    ConfigHandler.CONFIG_PATH = os.path.join(
        ConfigHandler.ROOT_DIR, 'tests', 'test_data', 'config_jsons',
        'config_disabled_retention_days.json')
    ConfigHandler.validate_config_file_base_variables()
    browser_handler = BrowserHandler()
    browser = browser_handler.get_browser()
    platform_registry = PlatformRegistry(browser=browser, dbms=self.dbms)
    platform_registry.register_new_platform(KarriereATHandler)
    platform_registry.create_platform_entries_in_database()
    ConfigHandler.validate_search_topics(platform_registry=platform_registry)
    browser_handler.close_browser()
    # "disabled" must have mapped the retention period to None
    self.assertEqual(ConfigHandler.POSTING_RETENTION_IN_DAYS, None)

    with session_scope(self.dbms) as session:
        # One fresh posting and one that would be expired if retention were on
        for posting_date in (today, expired_date):
            session.add(
                Vacancies(platform=self.platform.name,
                          date=posting_date,
                          url="http://",
                          title="title",
                          company="aha",
                          search_topic='Java',
                          location="Wien"))
    self.dbms.cleanup_job_postings_in_database()
    with session_scope(self.dbms) as session:
        remaining = session.query(Vacancies).all()
        # Auto-deletion is disabled, so both postings must still be present
        self.assertEqual(len(remaining), 2)
        self.assertEqual([row.date for row in remaining],
                         [today.date(), expired_date.date()])
def print_result_to_html(self, seach_topic_list: list, open_html_after_finish: bool = True):
    """Render all stored vacancies into the result HTML file.

    Copies the HTML template to the result path, then appends a
    jump-to-section header plus one section per (search topic, platform)
    pair with every matching Vacancies row from the database.

    :param seach_topic_list: search topics to render
        (NOTE(review): parameter name has a typo, but renaming it would
        break keyword callers — kept for backward compatibility)
    :param open_html_after_finish: when True, open the finished file via
        os.system after writing
    """
    # Copy the template html-file and use it as new result-html-file
    shutil.copy(src=ConfigHandler.TEMPLATE_HTML_PATH,
                dst=ConfigHandler.RESULT_HTML_PATH)
    with open(ConfigHandler.RESULT_HTML_PATH, 'a') as html_file:
        # Running number used to give every posting a unique checkbox id
        result_counter = 0
        with session_scope(dbms=self.dbms) as session:
            # Get all platform names
            platform_names = []
            result_rows = session.query(Platform.name).all()
            for row in result_rows:
                platform_names.extend(list(row))
            # Print all platform names in the header, hyper-linking to the
            # respective sections
            if self.verbose:
                print(f"{self.header}: Printing entries for platforms {str(platform_names)}")
            html_file.write(f'<div class="jump_to_section"><h6>Jump to section ... </h6><ul>\n')
            for search_topic in seach_topic_list:
                for platform in platform_names:
                    html_file.write(f'<li><a href="#{search_topic}_{platform}" class="contents_a">'
                                    f'... Search-Topic {search_topic} - {platform}</a></li>')
            html_file.write(f'</ul></div><div class="header_clear_area"></div>\n')
            # Print main body with all entries
            for search_topic in seach_topic_list:
                html_file.write(f"<h5>Search-Topic '{search_topic}'</h5>\n")
                for platform in platform_names:
                    platform_instance = self.platform_registry.get_platform_instance(platform)
                    if search_topic not in platform_instance.scrape_status:
                        # Each platform could have its own search topic. Only
                        # proceed if this search topic was applied to this
                        # platform_instance
                        continue
                    html_file.write(f'<h6 id="{search_topic}_{platform}">{platform} - Job postings</h6>')
                    html_file.write('<div class="posting-list-wrapper">')
                    if not platform_instance.scrape_status[search_topic]:
                        # Negative scrape status -> print error-message and
                        # jump to the next platform.
                        # Fix: the second literal was missing its f-prefix, so
                        # "{platform}" was written verbatim into the HTML.
                        html_file.write('<p class="error_message message">An error occurred when trying to '
                                        f'scrape entries from the platform {platform}</p><br></div>')
                        continue
                    # Get all entries in database for this platform
                    result_rows = session.query(Vacancies)\
                        .filter(Vacancies.platform == platform,
                                Vacancies.search_topic == search_topic).all()
                    if self.verbose:
                        print(result_rows)
                    if len(result_rows) == 0:
                        html_file.write('<p class="message">No job postings found</p>')
                    for row in result_rows:
                        result_counter += 1
                        job_item_string = f'<div class="job-item" >' \
                                          f'<a class="job-posting" href="{row.url}" target="_blank" ' \
                                          f'onclick="activateCheckBox(\'job_checkbox_{result_counter}\', ' \
                                          f'\'job_checkbox_label_{result_counter}\', \'outlined\', \'filled\');' \
                                          f'event.stopPropagation();">' \
                                          f'<div class="job-title-column"><div class="arrow-icon"></div>' \
                                          f'<div class="job-title">{row.title}</div>' \
                                          f'</div><div class="job-column">' \
                                          f'<div class="job-company">{row.company}' \
                                          f'</div>' \
                                          f'<div class="job-date">{row.date} - {row.location}</div>' \
                                          f'</div>' \
                                          f'</a>' \
                                          f'<div class="job-checkbox">' \
                                          f'<label for="job_checkbox_{result_counter}" ' \
                                          f'id="job_checkbox_label_{result_counter}" class="checkbox_btn outlined">Checked' \
                                          f'<input type="checkbox" style="opacity: 0;"' \
                                          f'onclick="toggleCheckBox(\'job_checkbox_{result_counter}\', ' \
                                          f'\'job_checkbox_label_{result_counter}\', \'outlined\', \'filled\');' \
                                          f' "' \
                                          f'id="job_checkbox_{result_counter}" class="badgebox">' \
                                          f'<span class="badge"> &#10004; </span></label>' \
                                          f'</div>' \
                                          f'</div>'
                        html_file.write(job_item_string)
                    html_file.write('</div><br><br>')
        html_file.write('</body></html>\n')
    if open_html_after_finish:
        os.system(ConfigHandler.RESULT_HTML_FILE_NAME)