def main(outfile): if not outfile.endswith(CSV_EXTENSION): outfile += CSV_EXTENSION site = Site() site.login(auto=True) services_category = site.categories['Κατάλογος Διαδικασιών'] services_category_text = services_category.text() services_page_schema_xml = re.sub(r'{{.*}}', '', services_category_text, flags=re.S) services_page_schema = xmltodict.parse(services_page_schema_xml) # import json # print(json.dumps(services_page_schema, indent=4, ensure_ascii=False)) with open(outfile, 'w') as f: csv_output = csv.writer(f) header_row = create_header_row() header_row_length = len(header_row) csv_output.writerow(header_row) templates = services_page_schema[PAGE_SCHEMA_KEY][TEMPLATE_KEY] for template in templates: for field in template[FIELD_KEY]: row = get_template_row(template) field_row = get_field_row(field) row.extend(field_row) row_length = len(row) assert row_length == header_row_length csv_output.writerow(row)
def __init__(self): self.__site = Site() self._site_logged_in = False # Dictionaries self.__data_by_code = {} self.__name_by_code = {} self.__code_by_name = {} self.__purpose_by_id = None self.__type_by_id = None
def main(outfile, namespace=PUBLISHED_NAMESPACE): if not outfile.endswith(CSV_EXTENSION): outfile += CSV_EXTENSION site = Site() site.login(auto=True) services_category = site.categories['Κατάλογος Διαδικασιών'] with open(outfile, 'w') as f: csv_output = csv.DictWriter(f, fieldnames=header_row) csv_output.writeheader() for page in services_category: name = page.name split_name = name.split(':') ns = split_name[0] if ns == namespace: # print only selected Namespace service_dict = _service_dict(TemplateEditor(page.text()), ns) # print(service_dict) csv_output.writerow(service_dict) f.close()
'Πολίτες άλλων κρατώνΕνημέρωση και επικαιροποίηση στοιχείων πολίτη': 'Πολίτες άλλων κρατών,Ενημέρωση και επικαιροποίηση στοιχείων πολίτη', 'Πολίτες άλλων κρατώνΕξ αποστάσεως εξυπηρέτηση πολιτών': 'Πολίτες άλλων κρατών,Εξ αποστάσεως εξυπηρέτηση πολιτών', 'Πολίτες άλλων κρατώνΤουρισμός': 'Πολίτες άλλων κρατών,Τουρισμός', 'Τελωνειακές υπηρεσίεςΑδειοδοτήσεις και συμμόρφωση': 'Τελωνειακές υπηρεσίες,Αδειοδοτήσεις και συμμόρφωση', 'ΤηλεπικοινωνίεςΤαχυδρομεία': 'Τηλεπικοινωνίες,Ταχυδρομεία', 'ΤρόφιμαΓεωργία': 'Τρόφιμα,Γεωργία', 'ΤρόφιμαΕπιδοτήσεις': 'Τρόφιμα,Επιδοτήσεις', 'Φάκελος υγείαςΕπίσκεψη και νοσηλεία σε νοσοκομείο': 'Φάκελος υγείας,Επίσκεψη και νοσηλεία σε νοσοκομείο', 'Φορολογία επιχειρήσεωνΑπασχόληση προσωπικού': 'Φορολογία επιχειρήσεων,Απασχόληση προσωπικού' } site = Site() site.login(auto=True) for page in site.categories[Service.CATEGORY_NAME]: page_text = page.text() for wrong_le, correct_le in WRONG_LIFE_EVENTS_MAP.items(): if wrong_le in page_text: page.edit(page_text.replace(wrong_le, correct_le)) print(f'Fixed life events {wrong_le} for {page.page_title}') break
class Organization: # API API_BASE_URL = 'https://hr.apografi.gov.gr/api' ORGS_ENDPOINT = f'{API_BASE_URL}/public/organizations' ORGS_TREE_URL_PREFIX = \ f'{API_BASE_URL}/public/organization-tree?organizationCode=' SEARCH_ORG_ENDPOINT = f'{ORGS_ENDPOINT}/search' DICT_ENDPOINT = f'{API_BASE_URL}/public/metadata/dictionary' PURPOSES_DICT_ENDPOINT = f'{DICT_ENDPOINT}/Functions' TYPES_DICT_ENDPOINT = f'{DICT_ENDPOINT}/OrganizationTypes' # Files ALL_ORGS_PICKLE_FILE = INOUT_FILES['org_all'] HIERARCHY_PICKLE_FILE = INOUT_FILES['org_hierarchy'] DETAILS_PICKLE_FILE = INOUT_FILES['org_details'] # Mediawiki CATEGORY_NAME = 'Φορείς' CATALOGUE_CATEGORY_NAME = 'Κατάλογος Φορέων' CATEGORY = f'[[Category:{CATEGORY_NAME}]]' CATALOGUE_CATEGORY = f'[[Category:{CATALOGUE_CATEGORY_NAME}]]' NAMESPACE = 'Φορέας' NAMESPACE_NUMBER = 9000 TEMPLATE_CONTACT_POINT_TELEPHONE = 'contactPoint_telephone' TEMPLATE_FIELD_NAME_SUFFIXES = [ 'code', 'preferredLabel', 'alternativeLabels', 'description', 'url', TEMPLATE_CONTACT_POINT_TELEPHONE, 'contactPoint_email', 'mainAddress_fullAddress', 'mainAddress_postCode', 'subOrganizationOf', 'identifier', 'purpose', 'vatId', 'status', 'foundationDate', 'terminationDate', 'organizationType', ] TEMPLATE_NAME = 'Φορέας' TEMPLATE_PARAM_PREFIX = 'gov_org_' template_parameters = [] for key in TEMPLATE_FIELD_NAME_SUFFIXES: template_parameters.append(f'|{TEMPLATE_PARAM_PREFIX}{key}=') TEMPLATE_PARAMETERS_TEXT = ''.join(template_parameters) TEMPLATE = f'{{{{{TEMPLATE_NAME}{TEMPLATE_PARAMETERS_TEXT}}}}}' # Miscellaneous STATUS_TRANSLATION = {'Active': 'Ενεργός', 'Inactive': 'Ανενεργός'} def __init__(self): self.__site = Site() self._site_logged_in = False # Dictionaries self.__data_by_code = {} self.__name_by_code = {} self.__code_by_name = {} self.__purpose_by_id = None self.__type_by_id = None @property def _site(self): if not self._site_logged_in: try: self.__site.login(auto=True) except SiteError as e: _error(str(e)) else: self._site_logged_in = True return self.__site @property def _purpose_by_id(self): if self.__purpose_by_id is None: self.__purpose_by_id = _dict_from_api_endpoint( self.PURPOSES_DICT_ENDPOINT) return self.__purpose_by_id @property def _type_by_id(self): if self.__type_by_id is None: self.__type_by_id = _dict_from_api_endpoint( self.TYPES_DICT_ENDPOINT) return self.__type_by_id def _data_by_code(self, code): data = self.__data_by_code.get(code) if data is None: try: self.__data_by_code[code] = requests.get( f'{self.ORGS_ENDPOINT}/{code}').json()['data'] except Exception: logger.error(f'No data found for org: {code}') else: data = self.__data_by_code.get(code) return data def _name_by_code(self, code): name = self.__name_by_code.get(code) if name is None: data = self._data_by_code(code) if data is not None: name = data['preferredLabel'] self.__name_by_code[code] = name return name def _code_by_name(self, name): code = self.__code_by_name.get(name) if code is None: self.__code_by_name = { ' '.join(org_dict['preferredLabel'].split()): org_dict['code'] for org_dict in self._all() } code = self.__code_by_name.get(name) return code def _tree_by_code(self, code): orgs_tree_url = f'{self.ORGS_TREE_URL_PREFIX}{code}' try: tree_dict = requests.get(orgs_tree_url).json()['data'] except Exception: logger.error(f'Failed to request {orgs_tree_url}.') tree_dict = None return tree_dict def _get_site_page(self, name, is_category=False): try: return self._site.categories[name] if is_category else \ self._site.pages(name) except Exception as e: logger.error(e, name) return None def _fetch_all_from_api(self): logger.debug('Fetching all orgs from API...') all_orgs = requests.get(self.ORGS_ENDPOINT).json()['data'] logger.debug('Fetched all orgs.') return all_orgs def _fetch_hierarchy_from_api(self): logger.debug('Fetching org hierarchy from API...') all_orgs = self._all() parent_children_orgs = {} for org_dict in all_orgs: parent_code = org_dict.get('subOrganizationOf') if parent_code is None: # parent_code does not exist, org_dict contains a root body orgcode = org_dict['code'] if orgcode not in parent_children_orgs: parent_children_orgs[orgcode] = { org_dict['preferredLabel']: [] } else: parentbody = parent_children_orgs.get(parent_code) if parentbody is None: # Parent body does not exist # Look in api orgs for org2 in all_orgs: if org2['code'] == parent_code: # Found parent body, add child body parent_children_orgs[parent_code] = { org2['preferredLabel']: [org_dict['preferredLabel']] } break else: # Parent body already exists, append child body parentbody[next(iter(parentbody))].append( org_dict['preferredLabel']) parent_children_orgs[parent_code] = parentbody hierarchy = {} for parentcode, parent_children_dict in parent_children_orgs.items(): hierarchy[next(iter(parent_children_dict))] = \ list(parent_children_dict.values())[0] logger.debug('Fetched org hierarchy.') return hierarchy def fetch_details_from_api(self, org_names=None): """Fetch organization details from the API. Args: org_names (list): The names of the organizations. Returns: dict: A dictionary of the details for each organization, as returned from the API. """ logger.debug('Fetching org details from API...') details = {} if org_names is None: org_names = self._all_page_names(without_namespace=True) for org in org_names: code = self._code_by_name(org) if code is None: continue data = self._data_by_code(code) if data is None: continue details[org] = data # Replace parent code with parent name (preferredLabel) parent_code = details[org].get('subOrganizationOf') if parent_code: parent_name = self._name_by_code(parent_code) if parent_name is None: parent_name = '' details[org]['subOrganizationOf'] = parent_name purpose_ids = details[org].get('purpose') # Replace purpose ids with purpose (function) names if purpose_ids: details[org]['purpose'] = ','.join( [self._purpose_by_id[id_] for id_ in purpose_ids]) # Replace status with greek translation status = details[org].get('status') if status: details[org]['status'] = self.STATUS_TRANSLATION[status] # Replace type id with type name type_id = details[org].get('organizationType') if type_id: details[org]['organizationType'] = self._type_by_id[type_id] logger.debug(f'{org} - fetched details') logger.debug('Fetched org details.') return details def _all(self, fetch_from_api=False): return _data(self.ALL_ORGS_PICKLE_FILE, self._fetch_all_from_api, fetch_from_api=fetch_from_api) def _hierarchy(self, fetch_from_api=False): return _data(self.HIERARCHY_PICKLE_FILE, self._fetch_hierarchy_from_api, fetch_from_api=fetch_from_api) def _details(self, fetch_from_api=False): return _data(self.DETAILS_PICKLE_FILE, self.fetch_details_from_api, fetch_from_api=fetch_from_api) def _all_page_names(self, without_namespace=False): action = 'query' list_param = 'allpages' continue_param = 'continue' apcontinue_param = 'apcontinue' kwargs = { 'format': 'json', 'list': list_param, 'apnamespace': self.NAMESPACE_NUMBER, 'aplimit': 5000 } page_names = [] continue_value = 0 while continue_value is not None: answer = self._site.api(action, **kwargs) continue_value = answer.get(continue_param, {}).get(apcontinue_param) kwargs[apcontinue_param] = continue_value page_names += [ page_result['title'] for page_result in answer[action][list_param] ] if without_namespace: page_names = [ name.replace(f'{self.NAMESPACE}:', '') for name in page_names ] return page_names def _all_pages(self): for name in self._all_page_names(): page = self._get_site_page(name) if page is not None: yield page def _all_pages_simple(self): for page in self._site.categories[self.CATALOGUE_CATEGORY_NAME]: yield page def _create_pages(self, name, parent_category=None): if parent_category is None: parent_category = self.CATEGORY replace_text = None else: replace_text = self.CATEGORY category_page = self._get_site_page(name, is_category=True) _add_text_to_page(category_page, parent_category, replace_text=replace_text) page = self._get_site_page(f'{self.NAMESPACE}:{name}') _add_text_to_page(page, self.CATALOGUE_CATEGORY) @_cli_command def recreate_tree(self, fetch_from_api=False): """Create new organization category tree and pages. Args: fetch_from_api (bool): Whether to fetch new organization data from the API or read the most recently saved data. """ logger.debug('Creating organization category tree and pages...') for parent, children in self._hierarchy( fetch_from_api=fetch_from_api).items(): self._create_pages(parent) parent_category = f'[[Category:{parent}]]' for child in children: self._create_pages(child, parent_category=parent_category) logger.debug('Done.') @_cli_command def nuke_tree(self): """Nuke organization category tree and pages.""" logger.debug('Nuking organization category tree and pages...') def recurse_delete(page): if page.exists: page_is_category = True try: page_members = page.members() except AttributeError: # page is not a category (no members) page_is_category = False else: # page is a category for member in page_members: recurse_delete(member) finally: if page_is_category or page.name.startswith( self.NAMESPACE): page.delete() logger.debug(f'{page.name} deleted.') root_category_page = self._site.categories[self.CATEGORY_NAME] for page in root_category_page.members(): recurse_delete(page) logger.debug('Done.') @_cli_command def update_pages(self, fetch_from_api=False, details=None, force_create=False): """Update organization pages from apografi API. Args: fetch_from_api (bool): Whether to fetch new organization data from the API or read the most recently saved data. details (dict): A dictionary of the details for each organization, as returned from the API. force_create (bool): Whether to create new organization pages that do not exist on the site. By default, non-existent pages are ignored. """ logger.debug('Updating organization pages...') def template_text(org_details): te = TemplateEditor(self.TEMPLATE) template = te.templates[self.TEMPLATE_NAME][0] # Add details to template parameters for key in self.TEMPLATE_FIELD_NAME_SUFFIXES: if '_' in key: details_keys = key.split('_') else: details_keys = None if details_keys is None: value = org_details.get(key, None) else: value = org_details.get(details_keys[0], {}).get(details_keys[1], None) if value is not None: if isinstance(value, list): value = ','.join(value) value = escape(str(value)) # Clean up telephone value if key == self.TEMPLATE_CONTACT_POINT_TELEPHONE: value = value.replace(' ', '').replace('+30', '') new_value = '' for c in value: if not c.isdigit(): break new_value += c value = new_value template.parameters[ f'{self.TEMPLATE_PARAM_PREFIX}{key}'] = value return str(template).replace(' |', '|') if details is None: details = self._details(fetch_from_api=fetch_from_api) for org, org_details in details.items(): page = self._get_site_page(f'{self.NAMESPACE}:{org}') page_condition = page is not None if not force_create: page_condition = page_condition and page.exists if page_condition: page_text = page.text() page_text_leftovers = re.sub( rf'{{{{{self.TEMPLATE_NAME}[^{{}}]+}}}}', '', page_text).strip() new_template_text = template_text(org_details) new_page_text = f'{new_template_text}\n{page_text_leftovers}' page.edit(new_page_text) logger.debug(f'{page.name} updated') logger.debug('Done.') @_cli_command def delete_old(self, fetch_from_api=False, dry_run=False): """Delete old organizations (removed from apografi API). Args: fetch_from_api (bool): Whether to fetch new organization data from the API or read the most recently saved data. dry_run (bool): Whether to perform a dry run or do the actual deletion. """ latest_org_names = [ re.sub(' +', ' ', org['preferredLabel'].strip()) for org in self._all(fetch_from_api=fetch_from_api) ] for org_page in self._all_pages_simple(): org_page_title = org_page.page_title if org_page_title not in latest_org_names: if dry_run: print(f'SHOULD BE DELETED: {org_page_title}') else: reason = 'Παλιός φορέας (δεν υπάρχει πια στην Απογραφή)' cat_str = '' org_page.delete(reason=reason) org_category_page = self._get_site_page(org_page_title, is_category=True) if org_category_page.exists: org_category_page.delete(reason=reason) cat_str = 'AND CATEGORY ' print(f'PAGE {cat_str}WAS DELETED: {org_page_title}') def units(self, name, unit_types=None): """Return the units of an organization. Args: name (string): The name of the organization. unit_types (list): A list of unit types, e.g. [4, 3, 41] Returns: list: The units of the organization. E.g. [ { "code": "100117", "preferredLabel": "ΓΕΝΙΚΗ ΓΡΑΜΜΑΤΕΙΑ ΔΗΜΟΣΙΑΣ ΥΓΕΙΑΣ", "unitType": 41 }, { "code": "521532", "preferredLabel": "ΓΕΝΙΚΗ ΓΡΑΜΜΑΤΕΙΑ ΥΠΗΡΕΣΙΩΝ ΥΓΕΙΑΣ", "unitType": 41 } ] """ units = [] def add_sub_unit(unit): children = unit.get('children') if children is not None: for child in children: add_sub_unit(child) del unit['children'] units.append(unit) org_code = self._code_by_name(name) if org_code is not None: org_tree = self._tree_by_code(org_code) if org_tree is not None: for org in org_tree.get('children', []): add_sub_unit(org) if unit_types is not None: units = [ unit for unit in units if unit['unitType'] in unit_types ] result = units return result
#!/usr/bin/env python3 from diavlos.src.site import Site for page in Site().categories['Κατάλογος Φορέων']: page_title = page.page_title if 'Φορέας:' not in page_title: print(page_title)
def __init__(self, site=None): self._site = site or Site()
from diavlos.src.eparavolo.error import eParavoloErrorCode from diavlos.src.eparavolo.error import eParavoloErrorData from diavlos.src.diaugeia.error import DiaugeiaErrorCode from diavlos.src.diaugeia.error import DiaugeiaErrorData from diavlos.src.helper.error import ErrorCode from diavlos.src.metadata import Metadata from diavlos.src.diaugeia import Diaugeia from diavlos.src.organization import Organization from diavlos.src.service import Service from diavlos.src.service.error import ServiceErrorData from diavlos.src.site import Site from diavlos.src.site import SiteError logging.basicConfig(level=logging.INFO) default_site = greek_site = Site() english_site = Site(config_file=IN_FILES['english_site_config']) service = Service(site=default_site) eparavolo = eParavolo() organization = Organization() metadata = Metadata() diaugeia = Diaugeia() def make_response(func): @functools.wraps(func) def wrapper(*args, **kwargs): result = func(*args, **kwargs) if isinstance(result, str) and not result.startswith('<'): response = {'success': False, 'message': result} status_code = 400
#!/usr/bin/env python3 from diavlos.src.site import Site for page in Site().categories['Κατάλογος Μητρώων']: page_title = page.page_title if 'Μητρώο:' not in page_title: print(page_title)