def test_wrong_year_of_elections(self):
    """Check the exceptions raised by `Downloader` for bad year values."""
    url_dict_patch = patch(
        "pkwscraper.lib.downloader.BASE_URL_DICT", self.mock_url_dict)
    with url_dict_patch:
        # a non-numeric string is expected to end with ValueError
        with self.assertRaises(ValueError):
            Downloader("dwa tysiące piętnaście", self.local_directory)
        # the year 2017 is expected to end with KeyError
        with self.assertRaises(KeyError):
            Downloader(2017, self.local_directory)
def test_connection_error(self):
    """A 503 response makes `download` raise and nothing gets saved."""
    # arrange
    downloader_stub = MagicMock()
    downloader_stub._convert_filename.return_value = self.filename
    downloader_stub._Downloader__base_url = self.mock_url_dict[2015]
    downloader_stub._Downloader__local_directory = self.local_directory

    # the file is reported as absent from the local directory
    exists_stub = MagicMock(return_value=False)
    failed_response = MagicMock()
    failed_response.status_code = 503
    requests_stub = MagicMock()
    requests_stub.get.return_value = failed_response

    exists_patch = patch(
        "pkwscraper.lib.downloader.os.path.exists", exists_stub)
    requests_patch = patch(
        "pkwscraper.lib.downloader.requests", requests_stub)
    print_patch = patch("pkwscraper.lib.downloader.print")

    # act
    with exists_patch, requests_patch, print_patch:
        with self.assertRaises(ConnectionError):
            Downloader.download(downloader_stub, self.relative_url)

    # assert
    downloader_stub._save_file.assert_not_called()
def test_get_from_cache(self):
    """A file saved beforehand is returned by `download` unchanged."""
    # arrange
    downloader = Downloader(2015, self.local_directory)
    downloader._save_file(self.fake_content, self.fake_filepath)

    # act
    cached_content = downloader.download(self.fake_rel_path)

    # assert
    self.assertEqual(cached_content, self.fake_content)
def test_wrong_relative_path(self):
    """`download` raises ValueError for "0/wyniki.csv", forced or not."""
    downloader_stub = MagicMock()
    bad_rel_url = "0/wyniki.csv"
    # first the default call, then the forced one
    for extra_kwargs in ({}, {"force": True}):
        with self.assertRaises(ValueError):
            Downloader.download(downloader_stub, bad_rel_url, **extra_kwargs)
def test_download(self):
    """`download` returns bytes and leaves a file at `self.filepath`."""
    # arrange
    downloader = Downloader(2015, self.local_directory)

    # act
    with patch("pkwscraper.lib.downloader.print"):
        content = downloader.download(self.rel_path)

    # assert
    self.assertIsInstance(content, bytes)
    self.assertGreater(len(content), 100)
    file_is_present = os.path.exists(self.filepath)
    self.assertTrue(file_is_present)
def test_check_file_extension(self):
    """`_convert_filename` raises ValueError only for the bad extensions."""
    url_stem = "/0/wyniki."
    rejected = ("doc", "py", "txt", "dat", "rar")
    accepted = ("csv", "xls", "xlsx", "htm", "html", "blob", "zip")

    for ext in rejected:
        with self.assertRaises(ValueError):
            Downloader._convert_filename(url_stem + ext)

    for ext in accepted:
        # passing means simply that no exception is raised here
        Downloader._convert_filename(url_stem + ext)
def __init__(self, db=None):
    """
    Prepare the downloader and the DB used for writing results.

    db: DbDriver or None - when `None`, a new `DbDriver` is created
        on `RESCRIBED_DATA_DIRECTORY`

    Raises `TypeError` if `db` is not a `DbDriver`, and `RuntimeError`
    if it is read only.
    """
    # downloader of the raw files
    self.dl = Downloader(year=2015, directory=RAW_DATA_DIRECTORY)

    # DB that receives the rescribed data
    target_db = DbDriver(RESCRIBED_DATA_DIRECTORY) if db is None else db
    if not isinstance(target_db, DbDriver):
        raise TypeError("Please pass an instance of `DbDriver` or `None`.")
    if target_db.read_only:
        raise RuntimeError("Please pass `DbDriver` for writing or `None`.")
    self.db = target_db

    # votes counter used for checking
    self.all_votes = 0
def test_force_redownloading(self):
    """With `force=True` the file is requested and saved, cache untouched."""
    # arrange
    downloader_stub = MagicMock()
    downloader_stub._convert_filename.return_value = self.filename
    downloader_stub._Downloader__base_url = self.mock_url_dict[2015]
    downloader_stub._Downloader__local_directory = self.local_directory

    # the file is reported as already present locally
    exists_stub = MagicMock(return_value=True)
    ok_response = MagicMock()
    ok_response.status_code = 200
    ok_response.content = self.blob_content
    requests_stub = MagicMock()
    requests_stub.get.return_value = ok_response

    exists_patch = patch(
        "pkwscraper.lib.downloader.os.path.exists", exists_stub)
    requests_patch = patch(
        "pkwscraper.lib.downloader.requests", requests_stub)
    print_patch = patch("pkwscraper.lib.downloader.print")

    # act
    with exists_patch, requests_patch, print_patch:
        downloaded = Downloader.download(
            downloader_stub, self.relative_url, force=True)

    # assert
    expected_url = self.mock_url_dict[2015] + self.relative_url
    requests_stub.get.assert_called_once_with(expected_url)
    downloader_stub._save_file.assert_called_once_with(
        self.blob_content, self.filepath)
    self.assertEqual(downloaded, self.blob_content)
    exists_stub.assert_not_called()
    downloader_stub._load_file.assert_not_called()
def test_convert_filename(self):
    """Check `_convert_filename` on three sample relative URLs."""
    # arrange
    bmp_url = "/0/wyniki.bmp"
    xls_url = "/0/wyniki.xls"
    root_url = "/"

    # act
    with self.assertRaises(ValueError):
        Downloader._convert_filename(bmp_url)
    xls_filename = Downloader._convert_filename(xls_url)
    root_filename = Downloader._convert_filename(root_url)

    # assert
    self.assertEqual(xls_filename, "0_wyniki.xls")
    self.assertEqual(root_filename, "index.html")
def test_not_existing_directory(self):
    """The constructor calls `makedirs` when `isdir` reports False."""
    # arrange
    missing_directory = "/nonexisting/db"
    os_stub = MagicMock()
    os_stub.path.isdir.return_value = False

    os_patch = patch("pkwscraper.lib.downloader.os", os_stub)
    print_patch = patch("pkwscraper.lib.downloader.print")

    # act
    with os_patch, print_patch:
        Downloader(2015, missing_directory)

    # assert
    os_stub.path.isdir.assert_called_once_with(missing_directory)
    os_stub.makedirs.assert_called_once_with(
        missing_directory, exist_ok=True)
def test_init(self):
    """Check the private attributes set by the constructor for "2015"."""
    # arrange
    os_stub = MagicMock()
    os_stub.path.isdir.return_value = True

    os_patch = patch("pkwscraper.lib.downloader.os", os_stub)
    url_dict_patch = patch(
        "pkwscraper.lib.downloader.BASE_URL_DICT", self.mock_url_dict)

    # act
    with os_patch, url_dict_patch:
        downloader = Downloader("2015", self.local_directory)

    # assert
    self.assertEqual(
        downloader._Downloader__base_url, "https://www.pkw2015.pl")
    self.assertEqual(
        downloader._Downloader__local_directory, "/raw_data/db")
def test_return_cached_file(self):
    """
    When the file exists locally, `download` is expected to load it
    through `_load_file` and return the loaded content.
    """
    # arrange
    dl = MagicMock()
    dl._Downloader__local_directory = self.local_directory
    dl._convert_filename.return_value = self.filepath
    dl._load_file.return_value = self.blob_content
    # the file is reported as present in the local directory
    mock_exists = MagicMock()
    mock_exists.return_value = True

    # act
    with patch("pkwscraper.lib.downloader.os.path.exists", mock_exists):
        file_content = Downloader.download(dl, self.relative_url)

    # assert
    dl._convert_filename.assert_called_once_with(self.relative_url)
    mock_exists.assert_called_once_with(self.filepath)
    dl._load_file.assert_called_once_with(self.filepath)
    # the stubbed `_load_file` result must be what the caller gets back;
    # previously the returned value was captured but never verified
    self.assertEqual(file_content, self.blob_content)
def test_force_redownload(self):
    """`force=True` replaces content that was saved locally beforehand."""
    # arrange
    downloader = Downloader(2015, self.local_directory)
    downloader._save_file(self.fake_content, self.filepath)

    # check cache - the saved fake content is returned
    cached_content = downloader.download(self.rel_path)
    self.assertEqual(cached_content, self.fake_content)

    # check redownload - the result differs from the fake content
    with patch("pkwscraper.lib.downloader.print"):
        fresh_content = downloader.download(self.rel_path, force=True)
    self.assertNotEqual(fresh_content, self.fake_content)
    file_is_present = os.path.exists(self.filepath)
    self.assertTrue(file_is_present)
class Sejm2015Scraper(BaseScraper):
    """
    Scraper of the 2015 Sejm election data.

    Raw files are obtained through a `Downloader` working on
    `RAW_DATA_DIRECTORY`; the extracted records are put into tables
    of a `DbDriver` database. Call `run_all` to perform all steps.
    """

    def __init__(self, db=None):
        """
        Prepare the downloader and the DB used for writing results.

        db: DbDriver or None - when `None`, a new `DbDriver` is created
            on `RESCRIBED_DATA_DIRECTORY`

        Raises `TypeError` if `db` is not a `DbDriver`, and
        `RuntimeError` if it is read only.
        """
        # create downloader object
        self.dl = Downloader(year=2015, directory=RAW_DATA_DIRECTORY)

        # open db for rescribing
        if db is None:
            db = DbDriver(RESCRIBED_DATA_DIRECTORY)
        if not isinstance(db, DbDriver):
            raise TypeError("Please pass an instance of `DbDriver` or `None`.")
        if db.read_only:
            raise RuntimeError(
                "Please pass `DbDriver` for writing or `None`.")
        self.db = db

        # for checking - total number of votes read from the results page,
        # compared later with sums computed from the spreadsheets
        self.all_votes = 0

    def run_all(self):
        """ Run all download steps in order and dump the DB tables. """
        # TODO - check, if the data is already downloaded
        # and omit unnecessary steps
        self._download_voivodships()
        self._download_okregi()
        self._download_committees()
        self._download_xls_candidates()
        self._download_html_candidates()
        self._download_mandates_winners_and_powiaty()
        self._download_gminy_and_obwody()
        self._download_voting_results()
        print("dumping DB tables...")
        self.db.dump_tables()
        print("DB closed.")
        print()

    def _download_voivodships(self):
        """ Read voivodships from the index page map into "województwa". """
        relative_path = "/index.html"
        html_content = self.dl.download(relative_path)
        html_tree = html.fromstring(html_content)
        xpath_voivodships = (
            '/html/body//div[@id="home"]//'
            'div[@class="home_content home_mapa"]//svg//a'
        )
        voivodship_elems = html_tree.xpath(xpath_voivodships)
        print()
        print(len(voivodship_elems))
        self.db.create_table("województwa")
        for html_elem in voivodship_elems:
            href_path = html_elem.attrib['xlink:href']
            # the code is the first number found in the link
            code = re.findall(r'\d+', href_path)[0]
            path_elem = html_elem.getchildren()[0]
            name = path_elem.attrib["rel"]
            geo = path_elem.attrib["d"]
            self.db["województwa"].put({
                "code": code,
                "name": name,
                "geo": geo
            })
            print(name, end=", ")
        print()

    def _download_okregi(self):
        """
        Read constituencies into "okręgi" and remember the total
        number of votes in `self.all_votes`.

        Each constituency is read both from the map and from the
        summary table; the two sources are asserted to agree.
        """
        relative_path = "/349_Wyniki_Sejm/0/0.html"
        html_content = self.dl.download(relative_path)
        html_tree = html.fromstring(html_content)

        xpath_all_votes = (
            '/html/body/div/div[4]/div[2]/div[3]'
            '/div[2]/div/div[1]/div/div[3]/div[2]/text()'
        )
        self.all_votes = int(html_tree.xpath(xpath_all_votes)[0])
        print()
        print(f"All votes: {self.all_votes}")

        xpath_okregi_mapa = (
            '/html/body//div[@id="wyniki1"]//'
            'div[@id="wyniki1_top_mapa"]//svg//a'
        )
        okregi_hrefs = html_tree.xpath(xpath_okregi_mapa)
        xpath_okregi_table = (
            '/html/body//div[@id="wyniki1"]//'
            'div[@id="wyniki1_tabela_suma"]//table/tbody/tr'
        )
        okregi_table_rows = html_tree.xpath(xpath_okregi_table)
        assert len(okregi_hrefs) == len(okregi_table_rows), \
            f"invalid number of constituencies: {len(okregi_hrefs)}, {len(okregi_table_rows)}"
        print()
        print(len(okregi_table_rows))

        self.db.create_table("okręgi")
        for html_map_elem, html_table_row_elem in \
                zip(okregi_hrefs, okregi_table_rows):
            # values taken from the map element
            href_path = html_map_elem.attrib['xlink:href']
            path_elem = html_map_elem.getchildren()[0]
            name = path_elem.attrib["rel"]
            number = re.findall(r'\d+', name)[0]
            geo = path_elem.attrib["d"]

            # values taken from the table row
            cell_elems = html_table_row_elem.getchildren()
            number_elem = cell_elems[0].getchildren()[0]
            href_path_2 = number_elem.attrib['href']
            number_2 = number_elem.text
            voivodship = cell_elems[1].getchildren()[0].text
            headquarters = cell_elems[2].getchildren()[0].text
            mandates = cell_elems[3].getchildren()[0].text

            assert href_path == href_path_2, f"invalid href: {href_path}, {href_path_2}"
            assert number == number_2, f"invalid number: {number}, {number_2}"
            self.db["okręgi"].put({
                "number": number,
                "headquarters": headquarters,
                "voivodship": voivodship,
                "mandates": mandates,
                "geo": geo
            })
            print(name, end=", ")
        print()

    def _download_committees(self):
        """ Read committees from the committees page into "komitety". """
        relative_path = "/komitety.html"
        html_content = self.dl.download(relative_path)
        html_tree = html.fromstring(html_content)
        xpath_committees = (
            '/html/body//div[@id="komitety"]//'
            'table/tbody/tr'
        )
        committees_elems = html_tree.xpath(xpath_committees)
        print()
        print(len(committees_elems))
        self.db.create_table("komitety")
        for html_elem in committees_elems:
            # the value of each cell sits in the first child of the cell
            value_elems = [child.getchildren()[0]
                           for child in html_elem.getchildren()]
            # not stored, but the lookup fails if the link is missing
            href_path = value_elems[0].attrib['href']
            number = value_elems[0].text
            signature = value_elems[1].text
            type_ = value_elems[2].text
            name = value_elems[3].text
            shortname = value_elems[4].text
            sejm_candidates = value_elems[5].text
            senat_candidates = value_elems[6].text
            status = value_elems[7].text
            self.db["komitety"].put({
                "number": number,
                "signature": signature,
                "type": type_,
                "name": name,
                "shortname": shortname,
                "sejm_candidates": sejm_candidates,
                "senat_candidates": senat_candidates,
                "status": status
            })
            print(name, end=", ")
        print()

    def _download_xls_candidates(self):
        """ Read candidates from the xls file into "kandydaci_xls". """
        # download xls data
        self.dl.download("/kandydaci.zip")
        with ZipFile(RAW_DATA_DIRECTORY + "/kandydaci.zip") as zf:
            zf.extractall(RAW_DATA_DIRECTORY)

        # open xls
        book = xlrd.open_workbook(
            RAW_DATA_DIRECTORY + "/kandsejm2015-10-19-10-00.xls")
        sheet = book.sheet_by_index(0)
        self.db.create_table("kandydaci_xls")

        # iterate over candidates (the first row is skipped)
        for row_index in range(1, sheet.nrows):
            row = sheet.row(row_index)
            okreg_number = row[0].value
            list_number = row[1].value
            committee_name = row[2].value
            position = row[3].value
            surname = row[4].value
            names = row[5].value
            gender = row[6].value
            residence = row[7].value
            occupation = row[8].value
            party = row[9].value
            self.db["kandydaci_xls"].put({
                "okreg_number": int(okreg_number),
                "list_number": int(list_number),
                "committee_name": committee_name,
                "position": int(position),
                "surname": surname,
                "names": names,
                "gender": gender,
                "residence": residence,
                "occupation": occupation,
                "party": party,
            })

        n_candidates = sheet.nrows - 1
        n_candidates_2 = len(self.db['kandydaci_xls'].find({}))
        assert n_candidates == n_candidates_2, \
            f"invalid candidates number: {n_candidates}, {n_candidates_2}"
        print()
        print(f"Found {n_candidates} candidates.")
        print()

    def _download_html_candidates(self):
        """
        Read candidates from constituencies pages into "kandydaci_html",
        including the information whether a candidate is crossed out.
        """
        # enumerate constituencies
        relative_url_template = "/349_Wyniki_Sejm/0/0/{}.html"
        okregi = self.db["okręgi"].find({}, fields="number")
        xpath_candidates = (
            '/html/body//div[@id="tresc"]//'
            'div[@id="wyniki1_tabela_frek"][2]//tbody/tr'
        )
        self.db.create_table("kandydaci_html")

        # iterate over constituencies
        for okreg_number in okregi:
            # find candidates from constituency page
            relative_url = relative_url_template.format(okreg_number)
            html_content = self.dl.download(relative_url)
            html_tree = html.fromstring(html_content)
            candidates_elements = html_tree.xpath(xpath_candidates)

            # save records
            for row_elem in candidates_elements:
                cells = row_elem.getchildren()
                committee_name = cells[1].text_content()
                full_name = list(cells[3].itertext())[1]
                # crossed out candidates are recognized by row class
                class_name = row_elem.get("class")
                crossed_out = (
                    class_name is not None
                    and "skreslony" in class_name
                )
                if crossed_out:
                    print(f"Crossed out candidate: {full_name},"
                          f" constituency: {okreg_number}, committee:"
                          f" {committee_name}.")
                self.db["kandydaci_html"].put({
                    "okreg_number": int(okreg_number),
                    "committee_shortname": committee_name,
                    "full_name": full_name,
                    "crossed_out": crossed_out
                })

    def _download_mandates_winners_and_powiaty(self):
        """
        Read districts into "powiaty" and mandates winners into
        "mandaty" from the constituencies pages.
        """
        relative_url_template = "/349_Wyniki_Sejm/0/0/{}.html"
        okregi = self.db["okręgi"].find({}, fields="number")
        xpath_powiaty = (
            '/html/body//div[@id="tresc"]//'
            'div[@id="wyniki1_top_mapa"]//svg//a'
        )
        xpath_winners = (
            '/html/body//div[@id="wyniki1_tabela_frek"][1]//'
            'table/tbody/tr'
        )
        self.db.create_table("mandaty")
        self.db.create_table("powiaty")
        print()

        for okreg_number in okregi:
            relative_url = relative_url_template.format(okreg_number)
            html_content = self.dl.download(relative_url)
            html_tree = html.fromstring(html_content)

            # districts from the map
            powiaty = html_tree.xpath(xpath_powiaty)
            for powiat_elem in powiaty:
                href_path = powiat_elem.attrib['xlink:href']
                # the code is the second number found in the link
                code = re.findall(r'\d+', href_path)[1]
                path_elem = powiat_elem.getchildren()[0]
                name = path_elem.attrib["rel"]
                geo = path_elem.attrib["d"]
                self.db["powiaty"].put({
                    "constituency_number": okreg_number,
                    "code": code,
                    "name": name,
                    "geo": geo
                })

            # mandates winners from the table
            winners_elems = html_tree.xpath(xpath_winners)
            for row_elem in winners_elems:
                cells = row_elem.getchildren()
                constituency_number = list(cells[0].itertext())[1]
                list_number = cells[1].text_content()
                position = cells[2].text_content()
                committee_name = cells[3].text_content()
                full_name = list(cells[4].itertext())[1]
                assert okreg_number == constituency_number, \
                    f"invalid constituency number: {okreg_number}, {constituency_number}"
                self.db["mandaty"].put({
                    "constituency_number": constituency_number,
                    "list_number": list_number,
                    "position": position,
                    "committee_name": committee_name,
                    "full_name": full_name
                })
                print(full_name, end=", ")
        print()

        n_powiaty = len(self.db["powiaty"].find({}))
        n_mandates = len(self.db["mandaty"].find({}))
        print(f"Found {n_powiaty} districts.")
        print(f"{n_mandates} of mandates were given.")

    def _download_gminy_and_obwody(self):
        """
        Read communes into "gminy" (from districts pages) and polling
        districts into "obwody" (from the xls file).

        Raises `RuntimeError` if the sum of valid votes from the xls
        file differs from `self.all_votes`.
        """
        # get html content for all districts (powiaty)
        powiaty_codes = self.db["powiaty"].find({}, fields="code")
        relative_url_template = "/349_Wyniki_Sejm/0/1/0/{}.html"
        xpath_gminy = (
            '/html/body//div[@id="tresc"]//'
            'div[@id="wyniki1_top_mapa"]//svg//a'
        )
        self.db.create_table("gminy")
        self.db.create_table("obwody")

        # extract communes data for each district
        for powiat_code in powiaty_codes:
            relative_url = relative_url_template.format(powiat_code)
            html_content = self.dl.download(relative_url)
            html_tree = html.fromstring(html_content)
            gminy = html_tree.xpath(xpath_gminy)
            for gmina_elem in gminy:
                # get values of attributes
                href_path = gmina_elem.attrib['xlink:href']
                code = re.findall(r'\d+', href_path)[0]
                path_elem = gmina_elem.getchildren()[0]
                partial_name = path_elem.attrib["rel"]
                geo = path_elem.attrib["d"]

                # Łódź and Cracow cities are divided into city districts
                # but they have no polling districts assigned to them...
                # Skip them.
                # TODO - MAYBE IT WOULD BE BETTER TO GET RID OF THESE
                # RECORDS IN THE PREPROCESSING STEP...
                skip_condition = (
                    partial_name.startswith("Łódź")
                    or partial_name.startswith("Kraków")
                )
                skip_condition = skip_condition and code[-2:] != "01"
                if skip_condition:
                    continue

                # put record in DB
                self.db["gminy"].put({
                    "code": code,
                    "partial_name": partial_name,
                    "geo": geo
                })
                # NOTE - `name`, `rural_or_urban` will be taken from
                # polling districts data in preprocessing step

        # extract polling districts information from xlsx
        self.dl.download("/wyniki_zb/2015-gl-lis-obw.zip")
        # the path separator is added like for all other files read
        # from the raw data directory in this class
        with ZipFile(
                RAW_DATA_DIRECTORY + "/wyniki_zb_2015-gl-lis-obw.zip") as zf:
            zf.extractall(RAW_DATA_DIRECTORY)

        all_votes = 0
        book = xlrd.open_workbook(
            RAW_DATA_DIRECTORY + "/2015-gl-lis-obw.xls")
        sheet = book.sheet_by_index(0)
        for row_index in range(1, sheet.nrows):
            row = sheet.row(row_index)
            # valid votes are summed up for the check below
            all_votes += int(row[27].value)
            self.db["obwody"].put({
                "constituency_number": int(row[0].value),
                "senate_constituency_number": int(row[1].value),
                "commune_code": row[2].value,
                "commune_name": row[3].value,
                "polling_district_number": int(row[4].value),
                "full_address": row[5].value,
                "voters": int(row[6].value),
                "got_ballots": int(row[7].value),
                "unused_ballots": int(row[8].value),
                "given_ballots": int(row[9].value),
                "proxy_voters": int(row[10].value),
                "certificate_voters": int(row[11].value),
                "voting_packets": int(row[12].value),
                "return_envelopes": int(row[13].value),
                "envelopes_without_statement": int(row[14].value),
                "unsigned_statement": int(row[15].value),
                "without_voting_envelope": int(row[16].value),
                "unseeled_voting_envelopes": int(row[17].value),
                "envelopes_accepted": int(row[18].value),
                "ballots_from_box": int(row[19].value),
                "envelopes_from_ballot_box": int(row[20].value),
                "ballots_invalid": int(row[21].value),
                "ballots_valid": int(row[22].value),
                "votes_invalid": int(row[23].value),
                "invalid_2_candidates": int(row[24].value),
                "invalid_no_vote": int(row[25].value),
                "invalid_candidate": int(row[26].value),
                "votes_valid": int(row[27].value)
            })

        if not all_votes == self.all_votes:
            # f-string, so that the actual numbers get into the message
            raise RuntimeError(
                f"Incompatible votes sum: {all_votes} vs {self.all_votes}.")

        print()
        n_gminy = len(self.db["gminy"].find({}))
        print(f"Found {n_gminy} communes.")
        print()
        n_obwody = len(self.db["obwody"].find({}))
        print(f"Found {n_obwody} voting constituencies.")

    def _download_voting_results(self):
        """
        Read votes from 41 xlsx files

        For each constituency a table "wyniki_{number}" is created with
        one record per polling district. Additional polling districts
        data go to the "obwody_uzupełnienie" table. If the sum of votes
        differs from `self.all_votes`, only a message is printed.
        """
        print()
        # create extension of polling districts data
        self.db.create_table("obwody_uzupełnienie")
        all_votes = 0

        # iterate over constituencies
        table_name_template = "wyniki_{}"
        constituencies = self.db["okręgi"].find({}, fields="number")
        for constituency_number in constituencies:
            # create table with results for a constituency
            constituency_number = int(constituency_number)
            table_name = table_name_template.format(constituency_number)
            self.db.create_table(table_name)

            # read data
            relative_url = f"/wyniki_okr_sejm/{constituency_number:02d}.xlsx"
            self.dl.download(relative_url)
            filename = relative_url.replace("/", "_").strip("_")
            book = load_workbook(RAW_DATA_DIRECTORY + "/" + filename,
                                 read_only=True)
            # workbook opened in read-only mode has to be closed
            # explicitly, also when reading fails
            try:
                sheet = book.active
                candidates = self._read_candidates_header(sheet)
                print(f"Iterating over polling districts in constituency no. {constituency_number}...")
                all_votes += self._store_polling_districts_results(
                    sheet, candidates, table_name)
            finally:
                book.close()

            print()
            print(f"Finished constituency no. {constituency_number}.")
            print()

        if not all_votes == self.all_votes:
            print(f"Incompatible votes sum: {all_votes} vs {self.all_votes}."
                  f" This comes from miscounting few crossed out candidates.")
        print()

    def _read_candidates_header(self, sheet):
        """
        Read the first row with lists and candidates names.

        Returns list of `(list_name, candidate_name)` tuples, one per
        column after the protocole columns; `candidate_name` is "sum"
        for the columns opening and closing each list.

        Raises `ValueError` if the last list is not closed.
        """
        last_cell_of_protocole_columns = sheet.cell(1, 27).value
        assert last_cell_of_protocole_columns == "Sejm - Liczba głosów ważnych oddanych łącznie na wszystkie listy kandydatów", \
            f"wrong columns alignment: {last_cell_of_protocole_columns}"

        current_list = None
        candidates = []
        # iterate over cells (`sheet.cell` takes 1-based column number)
        for column_index in range(27, sheet.max_column):
            value = sheet.cell(1, column_index+1).value

            # beginning of new list
            if not current_list:
                current_list = value
                candidates.append((current_list, "sum"))
                continue

            # end of current list
            if value == f"Razem {current_list}":
                candidates.append((current_list, "sum"))
                current_list = None
                continue

            # get candidate
            candidates.append((current_list, value))

        # check correctness
        if current_list is not None:
            raise ValueError("Did not found end of list of candidates.")
        return candidates

    def _store_polling_districts_results(self, sheet, candidates, table_name):
        """
        Put votes of all polling districts from `sheet` into the
        `table_name` table (1 row - 1 polling district) and return
        the sum of votes given to candidates.
        """
        # prepare list of cells corresponding with candidates - it is
        # the same for each row, so it is made and checked only once
        candidates_cell_range = range(27, sheet.max_column)
        if len(candidates_cell_range) != len(candidates):
            raise ValueError(
                f"Different lenght of candidates lists: "
                f"{len(candidates)}/{len(candidates_cell_range)}"
            )

        votes_sum = 0
        for row_index, row in enumerate(sheet.iter_rows()):
            # omit the row with columns names
            if row_index == 0:
                continue
            if row_index % 100 == 0:
                print(f"{row_index} of {sheet.max_row-1}",
                      end=", ", flush=True)

            # get additional polling district data
            commune_name = row[1].value
            commune_code = row[2].value
            commune_code = re.findall(r'\d+', commune_code)[0]
            commission_name = row[3].value
            polling_district_number = row[4].value

            # put it into table
            self.db["obwody_uzupełnienie"].put({
                "commune_name": commune_name,
                "commune_code": commune_code,
                "commission_name": commission_name,
                "polling_district_number": polling_district_number
            })

            # prepare polling district entry
            record = {
                "commune_code": commune_code,
                "polling_district_number": polling_district_number,
                "candidates_count": 0,
            }

            # iterate over candidates in the given polling district
            for candidate, column_index in zip(
                    candidates, candidates_cell_range):
                committee_name, candidate_name = candidate
                votes = row[column_index].value
                # columns with sums for lists are not stored
                if candidate_name == "sum":
                    continue
                try:
                    votes_sum += int(votes)
                except ValueError:
                    # "XXXXX" is stored as is, but not counted
                    # NOTE(review): presumably it marks crossed out
                    # candidates - confirm with the source files
                    if votes != "XXXXX":
                        raise

                # add field to the record
                assert "/" not in committee_name + candidate_name, \
                    f"`{committee_name + candidate_name}` has slash"
                candidate_identifier = f"{committee_name}/{candidate_name}"
                record[candidate_identifier] = votes
                record["candidates_count"] += 1

            # add record to table
            self.db[table_name].put(record)
        return votes_sum
def test_download_wrong_file(self):
    """Downloading `self.fake_rel_path` ends with ConnectionError."""
    downloader = Downloader(2015, self.local_directory)
    print_patch = patch("pkwscraper.lib.downloader.print")
    with self.assertRaises(ConnectionError):
        with print_patch:
            downloader.download(self.fake_rel_path)
def test_init(self):
    """The constructor stores the base URL for 2015 and the directory."""
    downloader = Downloader(2015, self.local_directory)
    stored_url = downloader._Downloader__base_url
    stored_directory = downloader._Downloader__local_directory
    self.assertEqual(stored_url, BASE_URL_DICT[2015])
    self.assertEqual(stored_directory, self.local_directory)