def main(outfile):
    """Export the fields of the services page schema to a CSV file.

    The text of the 'Κατάλογος Διαδικασιών' category page is fetched from
    the wiki, everything between the first '{{' and the last '}}' is
    removed, and the remainder is parsed as XML. One CSV row is written
    per (template, field) pair, preceded by a header row.

    Args:
        outfile (str): Output path; CSV_EXTENSION is appended when the
            path does not already end with it.

    Raises:
        AssertionError: If a generated row does not have as many columns
            as the header row.
    """
    def as_list(node):
        # xmltodict yields a single mapping (not a list) when an element
        # has exactly one child of a given tag; normalise so that the
        # loops below always iterate over elements.
        return node if isinstance(node, list) else [node]

    if not outfile.endswith(CSV_EXTENSION):
        outfile += CSV_EXTENSION
    site = Site()
    site.login(auto=True)
    services_category = site.categories['Κατάλογος Διαδικασιών']
    services_category_text = services_category.text()
    # re.S lets '.' span newlines, so the greedy match removes the whole
    # region from the first '{{' to the last '}}'.
    services_page_schema_xml = re.sub(
        r'{{.*}}', '', services_category_text, flags=re.S)
    services_page_schema = xmltodict.parse(services_page_schema_xml)
    # newline='' is required by the csv module so that it controls the
    # row terminators itself.
    with open(outfile, 'w', newline='') as f:
        csv_output = csv.writer(f)
        header_row = create_header_row()
        header_row_length = len(header_row)
        csv_output.writerow(header_row)
        templates = as_list(services_page_schema[PAGE_SCHEMA_KEY][TEMPLATE_KEY])
        for template in templates:
            for field in as_list(template[FIELD_KEY]):
                # A fresh template row is requested for every field
                # because the row is extended in place below.
                row = get_template_row(template)
                row.extend(get_field_row(field))
                assert len(row) == header_row_length, \
                    'row length does not match header length'
                csv_output.writerow(row)
def main(outfile, namespace=PUBLISHED_NAMESPACE):
    """Export the services of one namespace to a CSV file.

    Iterates over the members of the 'Κατάλογος Διαδικασιών' category and
    writes one CSV row for every page whose name starts with
    ``<namespace>:``.

    Args:
        outfile (str): Output path; CSV_EXTENSION is appended when the
            path does not already end with it.
        namespace (str): Only pages in this namespace are exported.
    """
    if not outfile.endswith(CSV_EXTENSION):
        outfile += CSV_EXTENSION
    site = Site()
    site.login(auto=True)
    services_category = site.categories['Κατάλογος Διαδικασιών']
    # newline='' is required by the csv module so that it controls the
    # row terminators itself. The context manager closes the file, so no
    # explicit close() is needed.
    with open(outfile, 'w', newline='') as f:
        # NOTE(review): header_row is not defined in this function; it is
        # presumably a module-level name - confirm.
        csv_output = csv.DictWriter(f, fieldnames=header_row)
        csv_output.writeheader()
        for page in services_category:
            # The namespace is whatever precedes the first ':' in the name.
            ns = page.name.split(':')[0]
            if ns != namespace:
                continue
            service_dict = _service_dict(TemplateEditor(page.text()), ns)
            csv_output.writerow(service_dict)
'Πολίτες άλλων κρατώνΕνημέρωση και επικαιροποίηση στοιχείων πολίτη': 'Πολίτες άλλων κρατών,Ενημέρωση και επικαιροποίηση στοιχείων πολίτη', 'Πολίτες άλλων κρατώνΕξ αποστάσεως εξυπηρέτηση πολιτών': 'Πολίτες άλλων κρατών,Εξ αποστάσεως εξυπηρέτηση πολιτών', 'Πολίτες άλλων κρατώνΤουρισμός': 'Πολίτες άλλων κρατών,Τουρισμός', 'Τελωνειακές υπηρεσίεςΑδειοδοτήσεις και συμμόρφωση': 'Τελωνειακές υπηρεσίες,Αδειοδοτήσεις και συμμόρφωση', 'ΤηλεπικοινωνίεςΤαχυδρομεία': 'Τηλεπικοινωνίες,Ταχυδρομεία', 'ΤρόφιμαΓεωργία': 'Τρόφιμα,Γεωργία', 'ΤρόφιμαΕπιδοτήσεις': 'Τρόφιμα,Επιδοτήσεις', 'Φάκελος υγείαςΕπίσκεψη και νοσηλεία σε νοσοκομείο': 'Φάκελος υγείας,Επίσκεψη και νοσηλεία σε νοσοκομείο', 'Φορολογία επιχειρήσεωνΑπασχόληση προσωπικού': 'Φορολογία επιχειρήσεων,Απασχόληση προσωπικού' } site = Site() site.login(auto=True) for page in site.categories[Service.CATEGORY_NAME]: page_text = page.text() for wrong_le, correct_le in WRONG_LIFE_EVENTS_MAP.items(): if wrong_le in page_text: page.edit(page_text.replace(wrong_le, correct_le)) print(f'Fixed life events {wrong_le} for {page.page_title}') break
class Organization:
    """Synchronise organization data between the apografi API and the wiki.

    Organization data is read from the HTTP endpoints declared below (or
    from previously saved files, via the module-level ``_data`` helper)
    and is used to create, update and delete the organization pages and
    categories of the wiki site.
    """

    # API
    API_BASE_URL = 'https://hr.apografi.gov.gr/api'
    ORGS_ENDPOINT = f'{API_BASE_URL}/public/organizations'
    ORGS_TREE_URL_PREFIX = \
        f'{API_BASE_URL}/public/organization-tree?organizationCode='
    SEARCH_ORG_ENDPOINT = f'{ORGS_ENDPOINT}/search'
    DICT_ENDPOINT = f'{API_BASE_URL}/public/metadata/dictionary'
    PURPOSES_DICT_ENDPOINT = f'{DICT_ENDPOINT}/Functions'
    TYPES_DICT_ENDPOINT = f'{DICT_ENDPOINT}/OrganizationTypes'
    # Files
    ALL_ORGS_PICKLE_FILE = INOUT_FILES['org_all']
    HIERARCHY_PICKLE_FILE = INOUT_FILES['org_hierarchy']
    DETAILS_PICKLE_FILE = INOUT_FILES['org_details']
    # Mediawiki
    CATEGORY_NAME = 'Φορείς'
    CATALOGUE_CATEGORY_NAME = 'Κατάλογος Φορέων'
    CATEGORY = f'[[Category:{CATEGORY_NAME}]]'
    CATALOGUE_CATEGORY = f'[[Category:{CATALOGUE_CATEGORY_NAME}]]'
    NAMESPACE = 'Φορέας'
    NAMESPACE_NUMBER = 9000
    TEMPLATE_CONTACT_POINT_TELEPHONE = 'contactPoint_telephone'
    # A suffix of the form '<section>_<field>' is read from the nested
    # mapping org_details[<section>][<field>] (see update_pages).
    TEMPLATE_FIELD_NAME_SUFFIXES = [
        'code',
        'preferredLabel',
        'alternativeLabels',
        'description',
        'url',
        TEMPLATE_CONTACT_POINT_TELEPHONE,
        'contactPoint_email',
        'mainAddress_fullAddress',
        'mainAddress_postCode',
        'subOrganizationOf',
        'identifier',
        'purpose',
        'vatId',
        'status',
        'foundationDate',
        'terminationDate',
        'organizationType',
    ]
    TEMPLATE_NAME = 'Φορέας'
    TEMPLATE_PARAM_PREFIX = 'gov_org_'
    # A plain loop is used on purpose: a comprehension in a class body
    # cannot see the other class-level names (e.g. TEMPLATE_PARAM_PREFIX).
    template_parameters = []
    for key in TEMPLATE_FIELD_NAME_SUFFIXES:
        template_parameters.append(f'|{TEMPLATE_PARAM_PREFIX}{key}=')
    TEMPLATE_PARAMETERS_TEXT = ''.join(template_parameters)
    # Empty template call: '{{<name>|<param>=|<param>=...}}'
    TEMPLATE = f'{{{{{TEMPLATE_NAME}{TEMPLATE_PARAMETERS_TEXT}}}}}'
    # Miscellaneous
    STATUS_TRANSLATION = {'Active': 'Ενεργός', 'Inactive': 'Ανενεργός'}

    def __init__(self):
        self.__site = Site()
        self._site_logged_in = False
        # Dictionaries (lazily filled caches)
        self.__data_by_code = {}
        self.__name_by_code = {}
        self.__code_by_name = {}
        self.__purpose_by_id = None
        self.__type_by_id = None

    @property
    def _site(self):
        """Return the site object, logging in on first access."""
        if not self._site_logged_in:
            try:
                self.__site.login(auto=True)
            except SiteError as e:
                _error(str(e))
            else:
                self._site_logged_in = True
        return self.__site

    @property
    def _purpose_by_id(self):
        """Return the purposes dictionary, fetched once from the API."""
        if self.__purpose_by_id is None:
            self.__purpose_by_id = _dict_from_api_endpoint(
                self.PURPOSES_DICT_ENDPOINT)
        return self.__purpose_by_id

    @property
    def _type_by_id(self):
        """Return the organization types dictionary, fetched once."""
        if self.__type_by_id is None:
            self.__type_by_id = _dict_from_api_endpoint(
                self.TYPES_DICT_ENDPOINT)
        return self.__type_by_id

    def _data_by_code(self, code):
        """Return the API data of the organization with the given code.

        Results are cached per code. Returns None (after logging an
        error) when the request or the decoding of its answer fails.
        """
        data = self.__data_by_code.get(code)
        if data is None:
            try:
                data = requests.get(
                    f'{self.ORGS_ENDPOINT}/{code}').json()['data']
            except Exception:
                logger.error(f'No data found for org: {code}')
            else:
                self.__data_by_code[code] = data
        return data

    def _name_by_code(self, code):
        """Return the preferredLabel of an organization code, or None."""
        name = self.__name_by_code.get(code)
        if name is None:
            data = self._data_by_code(code)
            if data is not None:
                name = data['preferredLabel']
                self.__name_by_code[code] = name
        return name

    def _code_by_name(self, name):
        """Return the code of the organization with the given name.

        On a cache miss the name-to-code mapping is rebuilt from all the
        organizations, keyed by the whitespace-normalised preferredLabel.
        Returns None when the name is still unknown after the rebuild.
        """
        code = self.__code_by_name.get(name)
        if code is None:
            self.__code_by_name = {
                ' '.join(org_dict['preferredLabel'].split()):
                org_dict['code']
                for org_dict in self._all()
            }
            code = self.__code_by_name.get(name)
        return code

    def _tree_by_code(self, code):
        """Return the organization tree of a code, or None on failure."""
        orgs_tree_url = f'{self.ORGS_TREE_URL_PREFIX}{code}'
        try:
            tree_dict = requests.get(orgs_tree_url).json()['data']
        except Exception:
            logger.error(f'Failed to request {orgs_tree_url}.')
            tree_dict = None
        return tree_dict

    def _get_site_page(self, name, is_category=False):
        """Return the site page (or category) called name, or None.

        Any exception raised while looking the page up is logged and
        None is returned instead.
        """
        try:
            return self._site.categories[name] if is_category else \
                self._site.pages(name)
        except Exception as e:
            # The page name goes into the message itself; passing it as a
            # separate positional argument would be treated as a format
            # argument for a message that has no placeholder.
            logger.error(f'{name}: {e}')
            return None

    def _fetch_all_from_api(self):
        """Fetch and return the data of all organizations from the API."""
        logger.debug('Fetching all orgs from API...')
        all_orgs = requests.get(self.ORGS_ENDPOINT).json()['data']
        logger.debug('Fetched all orgs.')
        return all_orgs

    def _fetch_hierarchy_from_api(self):
        """Build the parent -> children hierarchy of all organizations.

        Returns:
            dict: Maps the preferredLabel of each root or parent
                organization to the list of preferredLabels of its direct
                children. Organizations whose parent code matches no
                known organization are left out.
        """
        logger.debug('Fetching org hierarchy from API...')
        all_orgs = self._all()
        # Index the organizations by code once (first occurrence wins),
        # instead of scanning the whole list for every unseen parent.
        first_org_by_code = {}
        for org_dict in all_orgs:
            first_org_by_code.setdefault(org_dict.get('code'), org_dict)
        # parent code -> (parent label, [child labels])
        family_by_code = {}
        for org_dict in all_orgs:
            parent_code = org_dict.get('subOrganizationOf')
            if parent_code is None:
                # No parent code: org_dict is a root body
                orgcode = org_dict['code']
                if orgcode not in family_by_code:
                    family_by_code[orgcode] = (
                        org_dict['preferredLabel'], [])
                continue
            family = family_by_code.get(parent_code)
            if family is None:
                # Parent body not registered yet, look it up by its code
                parent = first_org_by_code.get(parent_code)
                if parent is None:
                    logger.debug(
                        f'Parent org not found for code: {parent_code}')
                    continue
                family = (parent['preferredLabel'], [])
                family_by_code[parent_code] = family
            family[1].append(org_dict['preferredLabel'])
        hierarchy = {}
        for parent_label, child_labels in family_by_code.values():
            hierarchy[parent_label] = child_labels
        logger.debug('Fetched org hierarchy.')
        return hierarchy

    def fetch_details_from_api(self, org_names=None):
        """Fetch organization details from the API.

        Args:
            org_names (list): The names of the organizations.

        Returns:
            dict: A dictionary of the details for each organization, as
                returned from the API.
        """
        logger.debug('Fetching org details from API...')
        details = {}
        if org_names is None:
            org_names = self._all_page_names(without_namespace=True)
        for org in org_names:
            code = self._code_by_name(org)
            if code is None:
                continue
            data = self._data_by_code(code)
            if data is None:
                continue
            # Work on a copy: the values replaced below must not leak
            # into the per-code cache, otherwise a second pass over the
            # same organization would try to translate already
            # translated values.
            org_details = dict(data)
            details[org] = org_details
            # Replace parent code with parent name (preferredLabel)
            parent_code = org_details.get('subOrganizationOf')
            if parent_code:
                parent_name = self._name_by_code(parent_code)
                if parent_name is None:
                    parent_name = ''
                org_details['subOrganizationOf'] = parent_name
            # Replace purpose ids with purpose (function) names
            purpose_ids = org_details.get('purpose')
            if purpose_ids:
                org_details['purpose'] = ','.join(
                    [self._purpose_by_id[id_] for id_ in purpose_ids])
            # Replace status with greek translation
            status = org_details.get('status')
            if status:
                org_details['status'] = self.STATUS_TRANSLATION[status]
            # Replace type id with type name
            type_id = org_details.get('organizationType')
            if type_id:
                org_details['organizationType'] = self._type_by_id[type_id]
            logger.debug(f'{org} - fetched details')
        logger.debug('Fetched org details.')
        return details

    def _all(self, fetch_from_api=False):
        """Return the data of all organizations (see ``_data``)."""
        return _data(self.ALL_ORGS_PICKLE_FILE, self._fetch_all_from_api,
                     fetch_from_api=fetch_from_api)

    def _hierarchy(self, fetch_from_api=False):
        """Return the organization hierarchy (see ``_data``)."""
        return _data(self.HIERARCHY_PICKLE_FILE,
                     self._fetch_hierarchy_from_api,
                     fetch_from_api=fetch_from_api)

    def _details(self, fetch_from_api=False):
        """Return the organization details (see ``_data``)."""
        return _data(self.DETAILS_PICKLE_FILE, self.fetch_details_from_api,
                     fetch_from_api=fetch_from_api)

    def _all_page_names(self, without_namespace=False):
        """Return the titles of all pages in the organization namespace.

        Args:
            without_namespace (bool): Whether to strip the leading
                '<NAMESPACE>:' prefix from the returned titles.
        """
        action = 'query'
        list_param = 'allpages'
        continue_param = 'continue'
        apcontinue_param = 'apcontinue'
        kwargs = {
            'format': 'json',
            'list': list_param,
            'apnamespace': self.NAMESPACE_NUMBER,
            'aplimit': 5000
        }
        page_names = []
        while True:
            answer = self._site.api(action, **kwargs)
            page_names += [
                page_result['title']
                for page_result in answer[action][list_param]
            ]
            # Keep querying for as long as the answer carries a
            # continuation value.
            continue_value = answer.get(continue_param, {}).get(
                apcontinue_param)
            if continue_value is None:
                break
            kwargs[apcontinue_param] = continue_value
        if without_namespace:
            # Strip the prefix only where it leads the title, so that a
            # title containing the same text further in stays intact.
            prefix = f'{self.NAMESPACE}:'
            page_names = [
                name[len(prefix):] if name.startswith(prefix) else name
                for name in page_names
            ]
        return page_names

    def _all_pages(self):
        """Yield every page of the organization namespace that is found."""
        for name in self._all_page_names():
            page = self._get_site_page(name)
            if page is not None:
                yield page

    def _all_pages_simple(self):
        """Yield the members of the organizations catalogue category."""
        yield from self._site.categories[self.CATALOGUE_CATEGORY_NAME]

    def _create_pages(self, name, parent_category=None):
        """Add category text to the category and page of an organization.

        Args:
            name (str): The name of the organization.
            parent_category (str): Category text added to the category
                page of the organization. When given, CATEGORY is passed
                to ``_add_text_to_page`` as the text to replace; when
                omitted, CATEGORY itself is added.
        """
        if parent_category is None:
            parent_category = self.CATEGORY
            replace_text = None
        else:
            replace_text = self.CATEGORY
        category_page = self._get_site_page(name, is_category=True)
        _add_text_to_page(category_page, parent_category,
                          replace_text=replace_text)
        page = self._get_site_page(f'{self.NAMESPACE}:{name}')
        _add_text_to_page(page, self.CATALOGUE_CATEGORY)

    @_cli_command
    def recreate_tree(self, fetch_from_api=False):
        """Create new organization category tree and pages.

        Args:
            fetch_from_api (bool): Whether to fetch new organization data
                from the API or read the most recently saved data.
        """
        logger.debug('Creating organization category tree and pages...')
        for parent, children in self._hierarchy(
                fetch_from_api=fetch_from_api).items():
            self._create_pages(parent)
            parent_category = f'[[Category:{parent}]]'
            for child in children:
                self._create_pages(child, parent_category=parent_category)
        logger.debug('Done.')

    @_cli_command
    def nuke_tree(self):
        """Nuke organization category tree and pages."""
        logger.debug('Nuking organization category tree and pages...')

        def recurse_delete(page):
            # Delete the members of a category first, then the page
            # itself when it is a category or its name starts with the
            # organization namespace.
            if page.exists:
                page_is_category = True
                try:
                    page_members = page.members()
                except AttributeError:
                    # page is not a category (no members)
                    page_is_category = False
                else:
                    # page is a category
                    for member in page_members:
                        recurse_delete(member)
                finally:
                    if page_is_category or page.name.startswith(
                            self.NAMESPACE):
                        page.delete()
                        logger.debug(f'{page.name} deleted.')

        root_category_page = self._site.categories[self.CATEGORY_NAME]
        for page in root_category_page.members():
            recurse_delete(page)
        logger.debug('Done.')

    @_cli_command
    def update_pages(self, fetch_from_api=False, details=None,
                     force_create=False):
        """Update organization pages from apografi API.

        Args:
            fetch_from_api (bool): Whether to fetch new organization data
                from the API or read the most recently saved data.
            details (dict): A dictionary of the details for each
                organization, as returned from the API.
            force_create (bool): Whether to create new organization pages
                that do not exist on the site. By default, non-existent
                pages are ignored.
        """
        logger.debug('Updating organization pages...')

        def template_text(org_details):
            # Render the organization template filled with org_details.
            te = TemplateEditor(self.TEMPLATE)
            template = te.templates[self.TEMPLATE_NAME][0]
            # Add details to template parameters
            for key in self.TEMPLATE_FIELD_NAME_SUFFIXES:
                if '_' in key:
                    section, _, field = key.partition('_')
                    # The section may be present but empty (None); treat
                    # that like a missing section.
                    value = (org_details.get(section) or {}).get(field)
                else:
                    value = org_details.get(key)
                if value is None:
                    continue
                if isinstance(value, list):
                    value = ','.join(value)
                value = escape(str(value))
                # Clean up telephone value
                if key == self.TEMPLATE_CONTACT_POINT_TELEPHONE:
                    value = value.replace(' ', '').replace('+30', '')
                    # Keep only the leading run of digits
                    leading_digits = []
                    for c in value:
                        if not c.isdigit():
                            break
                        leading_digits.append(c)
                    value = ''.join(leading_digits)
                template.parameters[
                    f'{self.TEMPLATE_PARAM_PREFIX}{key}'] = value
            return str(template).replace(' |', '|')

        if details is None:
            details = self._details(fetch_from_api=fetch_from_api)
        for org, org_details in details.items():
            page = self._get_site_page(f'{self.NAMESPACE}:{org}')
            page_condition = page is not None
            if not force_create:
                page_condition = page_condition and page.exists
            if page_condition:
                page_text = page.text()
                # Everything on the page except the existing template call
                page_text_leftovers = re.sub(
                    rf'{{{{{self.TEMPLATE_NAME}[^{{}}]+}}}}', '',
                    page_text).strip()
                new_template_text = template_text(org_details)
                new_page_text = f'{new_template_text}\n{page_text_leftovers}'
                page.edit(new_page_text)
                logger.debug(f'{page.name} updated')
        logger.debug('Done.')

    @_cli_command
    def delete_old(self, fetch_from_api=False, dry_run=False):
        """Delete old organizations (removed from apografi API).

        Args:
            fetch_from_api (bool): Whether to fetch new organization data
                from the API or read the most recently saved data.
            dry_run (bool): Whether to perform a dry run or do the actual
                deletion.
        """
        # A set keeps the per-page membership test cheap.
        latest_org_names = {
            re.sub(' +', ' ', org['preferredLabel'].strip())
            for org in self._all(fetch_from_api=fetch_from_api)
        }
        for org_page in self._all_pages_simple():
            org_page_title = org_page.page_title
            if org_page_title in latest_org_names:
                continue
            if dry_run:
                print(f'SHOULD BE DELETED: {org_page_title}')
                continue
            reason = 'Παλιός φορέας (δεν υπάρχει πια στην Απογραφή)'
            cat_str = ''
            org_page.delete(reason=reason)
            org_category_page = self._get_site_page(org_page_title,
                                                    is_category=True)
            # The category lookup returns None when it fails.
            if org_category_page is not None and org_category_page.exists:
                org_category_page.delete(reason=reason)
                cat_str = 'AND CATEGORY '
            print(f'PAGE {cat_str}WAS DELETED: {org_page_title}')

    def units(self, name, unit_types=None):
        """Return the units of an organization.

        Args:
            name (string): The name of the organization.
            unit_types (list): A list of unit types, e.g. [4, 3, 41]

        Returns:
            list: The units of the organization. E.g.
                [
                    {
                        "code": "100117",
                        "preferredLabel": "ΓΕΝΙΚΗ ΓΡΑΜΜΑΤΕΙΑ ΔΗΜΟΣΙΑΣ ΥΓΕΙΑΣ",
                        "unitType": 41
                    },
                    {
                        "code": "521532",
                        "preferredLabel": "ΓΕΝΙΚΗ ΓΡΑΜΜΑΤΕΙΑ ΥΠΗΡΕΣΙΩΝ ΥΓΕΙΑΣ",
                        "unitType": 41
                    }
                ]
        """
        units = []

        def add_sub_unit(unit):
            # Flatten the tree: descendants are collected before the unit
            # itself, and the nested 'children' entry is dropped.
            children = unit.get('children')
            if children is not None:
                for child in children:
                    add_sub_unit(child)
                del unit['children']
            units.append(unit)

        org_code = self._code_by_name(name)
        if org_code is not None:
            org_tree = self._tree_by_code(org_code)
            if org_tree is not None:
                # 'children' may be missing or None for a leaf tree
                for org in org_tree.get('children') or []:
                    add_sub_unit(org)
        if unit_types is not None:
            # Units without a 'unitType' never match a requested type
            units = [
                unit for unit in units
                if unit.get('unitType') in unit_types
            ]
        return units