def wf13_match_contribution_to_PDF_file():
    """
    Runs _match_to_file for every contribution of every MdL of a legislature
    and saves the resulting legislature container with dill.

    The legislature is asked for via ask_for_wahlperiode(). The work is only
    done if the directory ./parli_data/wf11_sessions/WP<legislature>/ exists;
    otherwise a hint is printed and nothing is loaded or saved.

    Saves:
        ./parli_data/wf13_contributions/WP_<legislature>.dill

    Returns:
        None
    """
    legislature = ask_for_wahlperiode()
    session_dir = f'./parli_data/wf11_sessions/WP{legislature}/'
    # NOTE(review): the directory check is read as guarding the whole
    # workflow, not only the debug print - confirm against the project history.
    if not os.path.isdir(session_dir):
        print(f'{session_dir} not found, nothing to match.')
        return None
    print(session_dir)

    wp = _open_dilled_wp(legislature)
    print(f'{wp.number_of_MdLs} parliamentarians in legislature {wp.wahlperiode}.')
    for mdl in wp.MdLs.values():
        # Loop over a deep copy of the contributions - presumably because
        # _match_to_file changes mdl.contributions; verify in that helper.
        contributions = copy.deepcopy(mdl.contributions)
        for protocol_nr, contribution in contributions.items():
            _match_to_file(wp, mdl, protocol_nr, contribution)

    dir_loc = './parli_data/wf13_contributions/'
    os.makedirs(dir_loc, exist_ok=True)
    file_loc = dir_loc + 'WP_{}.dill'.format(legislature)
    with open(file_loc, 'wb') as fout:
        dill.dump(wp, fout)
    return None
def wf11_download_session_pdfs():
    """
    Downloads the session PDFs of a legislature that are not on disk yet.

    The legislature is asked for via ask_for_wahlperiode(); the session urls
    come from _create_session_urls(). Every document is stored as
    ./parli_data/wf11_sessions/WP<legislature>/<Id>.pdf. Files that already
    exist are skipped.

    Progress is printed on one line: '.' for a file that was already there,
    '*' for a file that was downloaded.

    Returns:
        None
    """
    wahlperiode = ask_for_wahlperiode()
    dir_loc = './parli_data/' + 'wf11_sessions/WP{}/'.format(wahlperiode)
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(dir_loc, exist_ok=True)

    urls = _create_session_urls(wahlperiode)
    url_base = 'https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument?Id='
    for url in urls:
        file_loc = dir_loc + url.split('?Id=')[-1] + '.pdf'
        if os.path.isfile(file_loc):
            print('.', end='')
            continue
        # Rebuild the url on the canonical base, keeping only the document id.
        download_url = url_base + url.split('Id=')[-1]
        response = _get_response(download_url)
        print('*', end='')
        with open(file_loc, 'wb') as fout:
            fout.write(response.content)
    # Terminate the progress line.
    print()
def _open_dilled_wp():
    """
    Asks for a legislature via ask_for_wahlperiode() and loads
    ./parli_data/wf09_dilled_wps/WP_<legislature>.dill.

    Returns:
        The object restored by dill.load.
    """
    chosen = ask_for_wahlperiode()
    dill_path = f'./parli_data/wf09_dilled_wps/WP_{chosen}.dill'
    with open(dill_path, 'rb') as dill_file:
        return dill.load(dill_file)
def wf02_extract_wiki(legislature=None):
    """
    Collects the MdLs of a legislature from its bsObj.

    Args:
        legislature: the legislature to work on; when falsy it is asked for
            via ask_for_wahlperiode().

    Returns:
        Whatever _collect_mdls(bsObj, legislature) returns.
    """
    legislature = legislature or ask_for_wahlperiode()
    soup = _get_bsObj(legislature)
    return _collect_mdls(soup, legislature)
def wf03_mk_top_container_wahlperiode(wahlperiode=None):
    """
    Builds a Wahlperiode container filled with the MdLs delivered by
    wf02_extract_all_infos_about_MdLs().

    Every MdL is stored in wp.MdLs under a key of the form
        lastname_firstname_electoralward_legislature
    which has to be identical to the key the MdL already carries. An MdL
    whose key is already present is merged into the stored entry with
    _append_to_dict_entry() and is not counted again.

    Args:
        wahlperiode: the legislature to work on; when falsy it is asked for
            via ask_for_wahlperiode().

    Returns:
        wp, with wp.MdLs, wp.number_of_MdLs and a de-duplicated wp.names
        (ordered by _bubblesort) set.

    Raises:
        Exception: if a rebuilt key differs from the stored one, or if the
            number of stored MdLs does not match the expected number.
    """
    wahlperiode = wahlperiode or ask_for_wahlperiode()
    wp = Wahlperiode(int(wahlperiode))
    mdls = wf02_extract_all_infos_about_MdLs(wahlperiode)

    expected = len(mdls)
    stored = 0
    for candidate in mdls.values():
        carried_key = candidate.key
        key = '{}_{}_{}_{}'.format(
            candidate.last_name,
            candidate.first_name,
            candidate.electoral_ward,
            candidate.legislature,
        )
        if key != carried_key:
            raise Exception('wf03 needs attention')

        if key in wp.MdLs:
            # Duplicate key: merge into the existing entry, expect one less.
            wp.MdLs[key] = _append_to_dict_entry(key, wp.MdLs[key], candidate)
            expected -= 1
            print('total', expected)
            continue

        wp.MdLs[key] = candidate
        stored += 1

    print('{} of {} MdLs'.format(stored, expected))
    if stored != expected:
        raise Exception(
            "The number of MdLs and the number of names are not equal.")

    wp.number_of_MdLs = stored
    unique_names = list(set(wp.names))
    wp.names = _bubblesort(unique_names)
    print('Concludes wf03 by returning wp.names')
    return wp
def _load_local_dilled_wp(legislature):
    """
    Loads WP_<legislature>.dill from ./parli_data/wf15_dilled_wps/ and, if
    that file is missing, from ./parli_data/wf13_contributions/.
    """
    file_local = './parli_data/wf15_dilled_wps/' + 'WP_{}.dill'.format(legislature)
    print(f'opening file: {file_local}')
    try:
        with open(file_local, 'rb') as fin:
            return dill.load(fin)
    except FileNotFoundError:
        file_loc = './parli_data/wf13_contributions/' + 'WP_{}.dill'.format(legislature)
        print(f'opening file: {file_loc}')
        with open(file_loc, 'rb') as fin:
            return dill.load(fin)


def _open_dilled_wp():
    """
    Asks for a legislature via ask_for_wahlperiode() and loads the matching
    dilled container.

    Lookup order:
        1. the last entry of the sorted listing of
           /home/sam/projects/vEnvs/parli_NRW/parli_NRW/data/WP<legislature>/
        2. ./parli_data/wf15_dilled_wps/WP_<legislature>.dill
        3. ./parli_data/wf13_contributions/WP_<legislature>.dill

    NOTE: dill.load can run arbitrary code while unpickling; only load
    files from a trusted source.

    Returns:
        The object restored by dill.load.

    Raises:
        FileNotFoundError: if none of the three locations provides a file.
    """
    legislature = ask_for_wahlperiode()
    dir_ = f'/home/sam/projects/vEnvs/parli_NRW/parli_NRW/data/WP{legislature}/'
    # Stays None while no candidate file has been determined, so the message
    # in the handler below never touches an unbound name.
    latest_file = None
    try:
        entries = os.listdir(dir_)
        print(entries)
        latest_file = dir_ + sorted(entries)[-1]
        print(f'opening: {latest_file}')
        with open(latest_file, 'rb') as fin:
            return dill.load(fin)
    except (FileNotFoundError, IsADirectoryError):
        # Either the directory itself or the chosen entry could not be opened.
        print(f'did not find {latest_file or dir_}')
    except IndexError:
        # The directory exists but is empty.
        print(f'did not find a file in {dir_}')
    return _load_local_dilled_wp(legislature)
def wf01_save_wiki_bsObj():
    """
    Downloads and saves the Wikipedia list of parliamentarians (MdLs) of a
    legislature ("Wahlperiode"), e.g.
    https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Landtages_Nordrhein-Westfalen_(17._Wahlperiode)

    The legislature is asked for via ask_for_wahlperiode(). Download and
    saving are delegated to _download_and_save_bsObj(). If the target file
    already exists, the user is asked whether it should be overwritten;
    any answer other than "y" leaves the file untouched.

    Target file:
        ./parli_data/wf01_soup_objects/wikiListe_WP<legislature>.soup

    Returns:
        True if _download_and_save_bsObj() reported success, otherwise False
        (also when the user declined to overwrite).
    """
    url_with_all_MdLs = 'https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Landtages_Nordrhein-Westfalen_({}._Wahlperiode)'
    wahlperiode = ask_for_wahlperiode()
    url = url_with_all_MdLs.format(wahlperiode)
    print(url)
    dir_loc = "./parli_data/wf01_soup_objects/"
    file_loc = dir_loc + "wikiListe_WP{}.soup".format(wahlperiode)

    # makedirs also creates a missing ./parli_data/ parent, where os.mkdir
    # would fail with FileNotFoundError.
    os.makedirs(dir_loc, exist_ok=True)
    if os.path.exists(file_loc):
        check = input("File exists, overwrite? y/n")
        if check != "y":
            return False
    return bool(_download_and_save_bsObj(url, file_loc, wahlperiode))
def wf01_save_bsObj():
    """
    Downloads and saves the Landtag NRW page
    "https://www.landtag.nrw.de/portal/WWW/Webmaster/GB_II/II.2/Suche/Landtagsdokumentation_ALWP/Initiativen_Reden_von_Abgeordneten.jsp"
    that lists the parliamentarians (MdLs) of a legislature ("Wahlperiode").

    The legislature is asked for via ask_for_wahlperiode(). Download and
    saving are delegated to _download_and_save_bsObj(). If the target file
    already exists, the user is asked whether it should be overwritten;
    any answer other than "y" leaves the file untouched.

    Target file:
        ./parli_data/wf01_soup_objects/namenListe_WP<legislature>.soup

    Returns:
        True if _download_and_save_bsObj() reported success, otherwise False
        (also when the user declined to overwrite).
    """
    url_with_all_MdLs = "https://www.landtag.nrw.de/portal/WWW/Webmaster/GB_II/II.2/Suche/Landtagsdokumentation_ALWP/Initiativen_Reden_von_Abgeordneten.jsp?beg=ges&umfang=redner&wp={}"
    wahlperiode = ask_for_wahlperiode()
    url = url_with_all_MdLs.format(wahlperiode)
    print(url)
    dir_loc = "./parli_data/wf01_soup_objects/"
    file_loc = dir_loc + "namenListe_WP{}.soup".format(wahlperiode)

    # makedirs also creates a missing ./parli_data/ parent, where os.mkdir
    # would fail with FileNotFoundError.
    os.makedirs(dir_loc, exist_ok=True)
    if os.path.exists(file_loc):
        check = input("File exists, overwrite? y/n")
        if check != "y":
            return False
    return bool(_download_and_save_bsObj(url, file_loc, wahlperiode))
def wf02_extract_all_infos_about_MdLs(legislature=None, verbose=False):
    """
    Collects all MdLs of a legislature from its bsObj.

    A Wahlperiode container is created for the legislature, the bsObj is
    fetched with _get_bsObj() and both are handed to _collect_mdls(), which
    does the actual extraction.

    Args:
        legislature: the legislature to work on; when falsy it is asked for
            via ask_for_wahlperiode().
        verbose: passed through to _collect_mdls().

    Returns:
        mdls, the result of _collect_mdls() - per the closing message a
        dictionary with all MdLs of the legislature.
    """
    leg = legislature or ask_for_wahlperiode()
    container = Wahlperiode(int(leg))
    soup = _get_bsObj(leg)
    mdls = _collect_mdls(soup, container, leg, verbose)
    print(
        'Concludes wf02 with returning mdls (a dictionary with all MdLs of a legislature).'
    )
    return mdls
return first_name else: try: if party.upper() == mdl_wiki[-1].upper(): first_name = mdl_wiki[0].split(',')[-1] first_name = first_name.upper().strip() return first_name except TypeError: print('party', party) print('mdl_wiki', mdl_wiki) raise Exception except AttributeError: print('last_name, first_name', last_name, first_name) print(line) print('party', party) print('mdl_wiki', mdl_wiki) raise Exception return first_name if __name__ == "__main__": wahlperiode = ask_for_wahlperiode() verbose = False mdls = wf02_extract_all_infos_about_MdLs(wahlperiode, verbose) for key, mdl in mdls.items(): #if mdl.parl_pres: # print(f'{mdl.first_name} {mdl.last_name}, {mdl.party}, {mdl.parl_pres}') #print(f'{mdl.first_name} {mdl.last_name}, {mdl.party}') printout(mdl)
def chose_term(self):
    """Ask for a legislature via ask_for_wahlperiode() and store it on self.legislature."""
    chosen = ask_for_wahlperiode()
    self.legislature = chosen