Ejemplo n.º 1
0
def main(outfile):
    if not outfile.endswith(CSV_EXTENSION):
        outfile += CSV_EXTENSION
    site = Site()
    site.login(auto=True)
    services_category = site.categories['Κατάλογος Διαδικασιών']
    services_category_text = services_category.text()
    services_page_schema_xml = re.sub(r'{{.*}}',
                                      '',
                                      services_category_text,
                                      flags=re.S)
    services_page_schema = xmltodict.parse(services_page_schema_xml)
    # import json
    # print(json.dumps(services_page_schema, indent=4, ensure_ascii=False))
    with open(outfile, 'w') as f:
        csv_output = csv.writer(f)
        header_row = create_header_row()
        header_row_length = len(header_row)
        csv_output.writerow(header_row)
        templates = services_page_schema[PAGE_SCHEMA_KEY][TEMPLATE_KEY]
        for template in templates:
            for field in template[FIELD_KEY]:
                row = get_template_row(template)
                field_row = get_field_row(field)
                row.extend(field_row)
                row_length = len(row)
                assert row_length == header_row_length
                csv_output.writerow(row)
def main(outfile, namespace=PUBLISHED_NAMESPACE):
    if not outfile.endswith(CSV_EXTENSION):
        outfile += CSV_EXTENSION
    site = Site()
    site.login(auto=True)
    services_category = site.categories['Κατάλογος Διαδικασιών']

    with open(outfile, 'w') as f:
        csv_output = csv.DictWriter(f, fieldnames=header_row)

        csv_output.writeheader()

        for page in services_category:
            name = page.name
            split_name = name.split(':')
            ns = split_name[0]
            if ns == namespace:  # print only selected Namespace
                service_dict = _service_dict(TemplateEditor(page.text()), ns)
                # print(service_dict)
                csv_output.writerow(service_dict)

        f.close()
    'Πολίτες άλλων κρατώνΕνημέρωση και επικαιροποίηση στοιχείων πολίτη':
    'Πολίτες άλλων κρατών,Ενημέρωση και επικαιροποίηση στοιχείων πολίτη',
    'Πολίτες άλλων κρατώνΕξ αποστάσεως εξυπηρέτηση πολιτών':
    'Πολίτες άλλων κρατών,Εξ αποστάσεως εξυπηρέτηση πολιτών',
    'Πολίτες άλλων κρατώνΤουρισμός':
    'Πολίτες άλλων κρατών,Τουρισμός',
    'Τελωνειακές υπηρεσίεςΑδειοδοτήσεις και συμμόρφωση':
    'Τελωνειακές υπηρεσίες,Αδειοδοτήσεις και συμμόρφωση',
    'ΤηλεπικοινωνίεςΤαχυδρομεία':
    'Τηλεπικοινωνίες,Ταχυδρομεία',
    'ΤρόφιμαΓεωργία':
    'Τρόφιμα,Γεωργία',
    'ΤρόφιμαΕπιδοτήσεις':
    'Τρόφιμα,Επιδοτήσεις',
    'Φάκελος υγείαςΕπίσκεψη και νοσηλεία σε νοσοκομείο':
    'Φάκελος υγείας,Επίσκεψη και νοσηλεία σε νοσοκομείο',
    'Φορολογία επιχειρήσεωνΑπασχόληση προσωπικού':
    'Φορολογία επιχειρήσεων,Απασχόληση προσωπικού'
}

site = Site()
site.login(auto=True)

for page in site.categories[Service.CATEGORY_NAME]:
    page_text = page.text()
    for wrong_le, correct_le in WRONG_LIFE_EVENTS_MAP.items():
        if wrong_le in page_text:
            page.edit(page_text.replace(wrong_le, correct_le))
            print(f'Fixed life events {wrong_le} for {page.page_title}')
            break
Ejemplo n.º 4
0
class Organization:

    # API
    API_BASE_URL = 'https://hr.apografi.gov.gr/api'
    ORGS_ENDPOINT = f'{API_BASE_URL}/public/organizations'
    ORGS_TREE_URL_PREFIX = \
        f'{API_BASE_URL}/public/organization-tree?organizationCode='
    SEARCH_ORG_ENDPOINT = f'{ORGS_ENDPOINT}/search'
    DICT_ENDPOINT = f'{API_BASE_URL}/public/metadata/dictionary'
    PURPOSES_DICT_ENDPOINT = f'{DICT_ENDPOINT}/Functions'
    TYPES_DICT_ENDPOINT = f'{DICT_ENDPOINT}/OrganizationTypes'
    # Files
    ALL_ORGS_PICKLE_FILE = INOUT_FILES['org_all']
    HIERARCHY_PICKLE_FILE = INOUT_FILES['org_hierarchy']
    DETAILS_PICKLE_FILE = INOUT_FILES['org_details']
    # Mediawiki
    CATEGORY_NAME = 'Φορείς'
    CATALOGUE_CATEGORY_NAME = 'Κατάλογος Φορέων'
    CATEGORY = f'[[Category:{CATEGORY_NAME}]]'
    CATALOGUE_CATEGORY = f'[[Category:{CATALOGUE_CATEGORY_NAME}]]'
    NAMESPACE = 'Φορέας'
    NAMESPACE_NUMBER = 9000
    TEMPLATE_CONTACT_POINT_TELEPHONE = 'contactPoint_telephone'
    TEMPLATE_FIELD_NAME_SUFFIXES = [
        'code',
        'preferredLabel',
        'alternativeLabels',
        'description',
        'url',
        TEMPLATE_CONTACT_POINT_TELEPHONE,
        'contactPoint_email',
        'mainAddress_fullAddress',
        'mainAddress_postCode',
        'subOrganizationOf',
        'identifier',
        'purpose',
        'vatId',
        'status',
        'foundationDate',
        'terminationDate',
        'organizationType',
    ]
    TEMPLATE_NAME = 'Φορέας'
    TEMPLATE_PARAM_PREFIX = 'gov_org_'
    template_parameters = []
    for key in TEMPLATE_FIELD_NAME_SUFFIXES:
        template_parameters.append(f'|{TEMPLATE_PARAM_PREFIX}{key}=')
    TEMPLATE_PARAMETERS_TEXT = ''.join(template_parameters)
    TEMPLATE = f'{{{{{TEMPLATE_NAME}{TEMPLATE_PARAMETERS_TEXT}}}}}'
    # Miscellaneous
    STATUS_TRANSLATION = {'Active': 'Ενεργός', 'Inactive': 'Ανενεργός'}

    def __init__(self):
        self.__site = Site()
        self._site_logged_in = False
        # Dictionaries
        self.__data_by_code = {}
        self.__name_by_code = {}
        self.__code_by_name = {}
        self.__purpose_by_id = None
        self.__type_by_id = None

    @property
    def _site(self):
        if not self._site_logged_in:
            try:
                self.__site.login(auto=True)
            except SiteError as e:
                _error(str(e))
            else:
                self._site_logged_in = True
        return self.__site

    @property
    def _purpose_by_id(self):
        if self.__purpose_by_id is None:
            self.__purpose_by_id = _dict_from_api_endpoint(
                self.PURPOSES_DICT_ENDPOINT)
        return self.__purpose_by_id

    @property
    def _type_by_id(self):
        if self.__type_by_id is None:
            self.__type_by_id = _dict_from_api_endpoint(
                self.TYPES_DICT_ENDPOINT)
        return self.__type_by_id

    def _data_by_code(self, code):
        data = self.__data_by_code.get(code)
        if data is None:
            try:
                self.__data_by_code[code] = requests.get(
                    f'{self.ORGS_ENDPOINT}/{code}').json()['data']
            except Exception:
                logger.error(f'No data found for org: {code}')
            else:
                data = self.__data_by_code.get(code)
        return data

    def _name_by_code(self, code):
        name = self.__name_by_code.get(code)
        if name is None:
            data = self._data_by_code(code)
            if data is not None:
                name = data['preferredLabel']
                self.__name_by_code[code] = name
        return name

    def _code_by_name(self, name):
        code = self.__code_by_name.get(name)
        if code is None:
            self.__code_by_name = {
                ' '.join(org_dict['preferredLabel'].split()): org_dict['code']
                for org_dict in self._all()
            }
            code = self.__code_by_name.get(name)
        return code

    def _tree_by_code(self, code):
        orgs_tree_url = f'{self.ORGS_TREE_URL_PREFIX}{code}'
        try:
            tree_dict = requests.get(orgs_tree_url).json()['data']
        except Exception:
            logger.error(f'Failed to request {orgs_tree_url}.')
            tree_dict = None
        return tree_dict

    def _get_site_page(self, name, is_category=False):
        try:
            return self._site.categories[name] if is_category else \
                self._site.pages(name)
        except Exception as e:
            logger.error(e, name)
            return None

    def _fetch_all_from_api(self):
        logger.debug('Fetching all orgs from API...')
        all_orgs = requests.get(self.ORGS_ENDPOINT).json()['data']
        logger.debug('Fetched all orgs.')
        return all_orgs

    def _fetch_hierarchy_from_api(self):
        logger.debug('Fetching org hierarchy from API...')
        all_orgs = self._all()
        parent_children_orgs = {}
        for org_dict in all_orgs:
            parent_code = org_dict.get('subOrganizationOf')
            if parent_code is None:
                # parent_code does not exist, org_dict contains a root body
                orgcode = org_dict['code']
                if orgcode not in parent_children_orgs:
                    parent_children_orgs[orgcode] = {
                        org_dict['preferredLabel']: []
                    }
            else:
                parentbody = parent_children_orgs.get(parent_code)
                if parentbody is None:
                    # Parent body does not exist
                    # Look in api orgs
                    for org2 in all_orgs:
                        if org2['code'] == parent_code:
                            # Found parent body, add child body
                            parent_children_orgs[parent_code] = {
                                org2['preferredLabel']:
                                [org_dict['preferredLabel']]
                            }
                            break
                else:
                    # Parent body already exists, append child body
                    parentbody[next(iter(parentbody))].append(
                        org_dict['preferredLabel'])
                    parent_children_orgs[parent_code] = parentbody
        hierarchy = {}
        for parentcode, parent_children_dict in parent_children_orgs.items():
            hierarchy[next(iter(parent_children_dict))] = \
                list(parent_children_dict.values())[0]
        logger.debug('Fetched org hierarchy.')
        return hierarchy

    def fetch_details_from_api(self, org_names=None):
        """Fetch organization details from the API.

        Args:
            org_names (list): The names of the organizations.

        Returns:
            dict: A dictionary of the details for each organization,
                as returned from the API.
        """
        logger.debug('Fetching org details from API...')
        details = {}
        if org_names is None:
            org_names = self._all_page_names(without_namespace=True)
        for org in org_names:
            code = self._code_by_name(org)
            if code is None:
                continue
            data = self._data_by_code(code)
            if data is None:
                continue
            details[org] = data
            # Replace parent code with parent name (preferredLabel)
            parent_code = details[org].get('subOrganizationOf')
            if parent_code:
                parent_name = self._name_by_code(parent_code)
                if parent_name is None:
                    parent_name = ''
                details[org]['subOrganizationOf'] = parent_name
            purpose_ids = details[org].get('purpose')
            # Replace purpose ids with purpose (function) names
            if purpose_ids:
                details[org]['purpose'] = ','.join(
                    [self._purpose_by_id[id_] for id_ in purpose_ids])
            # Replace status with greek translation
            status = details[org].get('status')
            if status:
                details[org]['status'] = self.STATUS_TRANSLATION[status]
            # Replace type id with type name
            type_id = details[org].get('organizationType')
            if type_id:
                details[org]['organizationType'] = self._type_by_id[type_id]
            logger.debug(f'{org} - fetched details')
        logger.debug('Fetched org details.')
        return details

    def _all(self, fetch_from_api=False):
        return _data(self.ALL_ORGS_PICKLE_FILE,
                     self._fetch_all_from_api,
                     fetch_from_api=fetch_from_api)

    def _hierarchy(self, fetch_from_api=False):
        return _data(self.HIERARCHY_PICKLE_FILE,
                     self._fetch_hierarchy_from_api,
                     fetch_from_api=fetch_from_api)

    def _details(self, fetch_from_api=False):
        return _data(self.DETAILS_PICKLE_FILE,
                     self.fetch_details_from_api,
                     fetch_from_api=fetch_from_api)

    def _all_page_names(self, without_namespace=False):
        action = 'query'
        list_param = 'allpages'
        continue_param = 'continue'
        apcontinue_param = 'apcontinue'
        kwargs = {
            'format': 'json',
            'list': list_param,
            'apnamespace': self.NAMESPACE_NUMBER,
            'aplimit': 5000
        }
        page_names = []
        continue_value = 0
        while continue_value is not None:
            answer = self._site.api(action, **kwargs)
            continue_value = answer.get(continue_param,
                                        {}).get(apcontinue_param)
            kwargs[apcontinue_param] = continue_value
            page_names += [
                page_result['title']
                for page_result in answer[action][list_param]
            ]
        if without_namespace:
            page_names = [
                name.replace(f'{self.NAMESPACE}:', '') for name in page_names
            ]
        return page_names

    def _all_pages(self):
        for name in self._all_page_names():
            page = self._get_site_page(name)
            if page is not None:
                yield page

    def _all_pages_simple(self):
        for page in self._site.categories[self.CATALOGUE_CATEGORY_NAME]:
            yield page

    def _create_pages(self, name, parent_category=None):
        if parent_category is None:
            parent_category = self.CATEGORY
            replace_text = None
        else:
            replace_text = self.CATEGORY
        category_page = self._get_site_page(name, is_category=True)
        _add_text_to_page(category_page,
                          parent_category,
                          replace_text=replace_text)
        page = self._get_site_page(f'{self.NAMESPACE}:{name}')
        _add_text_to_page(page, self.CATALOGUE_CATEGORY)

    @_cli_command
    def recreate_tree(self, fetch_from_api=False):
        """Create new organization category tree and pages.

        Args:
            fetch_from_api (bool): Whether to fetch new organization
                data from the API or read the most recently saved data.
        """
        logger.debug('Creating organization category tree and pages...')
        for parent, children in self._hierarchy(
                fetch_from_api=fetch_from_api).items():
            self._create_pages(parent)
            parent_category = f'[[Category:{parent}]]'
            for child in children:
                self._create_pages(child, parent_category=parent_category)
        logger.debug('Done.')

    @_cli_command
    def nuke_tree(self):
        """Nuke organization category tree and pages."""
        logger.debug('Nuking organization category tree and pages...')

        def recurse_delete(page):
            if page.exists:
                page_is_category = True
                try:
                    page_members = page.members()
                except AttributeError:
                    # page is not a category (no members)
                    page_is_category = False
                else:
                    # page is a category
                    for member in page_members:
                        recurse_delete(member)
                finally:
                    if page_is_category or page.name.startswith(
                            self.NAMESPACE):
                        page.delete()
                        logger.debug(f'{page.name} deleted.')

        root_category_page = self._site.categories[self.CATEGORY_NAME]
        for page in root_category_page.members():
            recurse_delete(page)
        logger.debug('Done.')

    @_cli_command
    def update_pages(self,
                     fetch_from_api=False,
                     details=None,
                     force_create=False):
        """Update organization pages from apografi API.

        Args:
            fetch_from_api (bool): Whether to fetch new organization
                data from the API or read the most recently saved data.
            details (dict): A dictionary of the details for each organization,
                as returned from the API.
            force_create (bool): Whether to create new organization pages that
                do not exist on the site. By default, non-existent pages are
                ignored.
        """
        logger.debug('Updating organization pages...')

        def template_text(org_details):
            te = TemplateEditor(self.TEMPLATE)
            template = te.templates[self.TEMPLATE_NAME][0]
            # Add details to template parameters
            for key in self.TEMPLATE_FIELD_NAME_SUFFIXES:
                if '_' in key:
                    details_keys = key.split('_')
                else:
                    details_keys = None
                if details_keys is None:
                    value = org_details.get(key, None)
                else:
                    value = org_details.get(details_keys[0],
                                            {}).get(details_keys[1], None)
                if value is not None:
                    if isinstance(value, list):
                        value = ','.join(value)
                    value = escape(str(value))
                    # Clean up telephone value
                    if key == self.TEMPLATE_CONTACT_POINT_TELEPHONE:
                        value = value.replace(' ', '').replace('+30', '')
                        new_value = ''
                        for c in value:
                            if not c.isdigit():
                                break
                            new_value += c
                        value = new_value
                    template.parameters[
                        f'{self.TEMPLATE_PARAM_PREFIX}{key}'] = value
            return str(template).replace(' |', '|')

        if details is None:
            details = self._details(fetch_from_api=fetch_from_api)
        for org, org_details in details.items():
            page = self._get_site_page(f'{self.NAMESPACE}:{org}')
            page_condition = page is not None
            if not force_create:
                page_condition = page_condition and page.exists
            if page_condition:
                page_text = page.text()
                page_text_leftovers = re.sub(
                    rf'{{{{{self.TEMPLATE_NAME}[^{{}}]+}}}}', '',
                    page_text).strip()
                new_template_text = template_text(org_details)
                new_page_text = f'{new_template_text}\n{page_text_leftovers}'
                page.edit(new_page_text)
                logger.debug(f'{page.name} updated')
        logger.debug('Done.')

    @_cli_command
    def delete_old(self, fetch_from_api=False, dry_run=False):
        """Delete old organizations (removed from apografi API).

        Args:
            fetch_from_api (bool): Whether to fetch new organization
                data from the API or read the most recently saved data.
            dry_run (bool): Whether to perform a dry run or do the actual
                deletion.
        """
        latest_org_names = [
            re.sub(' +', ' ', org['preferredLabel'].strip())
            for org in self._all(fetch_from_api=fetch_from_api)
        ]
        for org_page in self._all_pages_simple():
            org_page_title = org_page.page_title
            if org_page_title not in latest_org_names:
                if dry_run:
                    print(f'SHOULD BE DELETED: {org_page_title}')
                else:
                    reason = 'Παλιός φορέας (δεν υπάρχει πια στην Απογραφή)'
                    cat_str = ''
                    org_page.delete(reason=reason)
                    org_category_page = self._get_site_page(org_page_title,
                                                            is_category=True)
                    if org_category_page.exists:
                        org_category_page.delete(reason=reason)
                        cat_str = 'AND CATEGORY '
                    print(f'PAGE {cat_str}WAS DELETED: {org_page_title}')

    def units(self, name, unit_types=None):
        """Return the units of an organization.

        Args:
            name (string): The name of the organization.
            unit_types (list): A list of unit types, e.g. [4, 3, 41]

        Returns:
            list: The units of the organization. E.g.
            [
                {
                  "code": "100117",
                  "preferredLabel": "ΓΕΝΙΚΗ ΓΡΑΜΜΑΤΕΙΑ ΔΗΜΟΣΙΑΣ ΥΓΕΙΑΣ",
                  "unitType": 41
                },
                {
                  "code": "521532",
                  "preferredLabel": "ΓΕΝΙΚΗ ΓΡΑΜΜΑΤΕΙΑ ΥΠΗΡΕΣΙΩΝ ΥΓΕΙΑΣ",
                  "unitType": 41
                }
            ]
        """
        units = []

        def add_sub_unit(unit):
            children = unit.get('children')
            if children is not None:
                for child in children:
                    add_sub_unit(child)
                del unit['children']
            units.append(unit)

        org_code = self._code_by_name(name)
        if org_code is not None:
            org_tree = self._tree_by_code(org_code)
            if org_tree is not None:
                for org in org_tree.get('children', []):
                    add_sub_unit(org)
            if unit_types is not None:
                units = [
                    unit for unit in units if unit['unitType'] in unit_types
                ]
        result = units
        return result