def test_migrate_OP_VIP_datasets(self):
    """Migrate a hand-picked list of VIP datasets from postgres to Virtuoso.

    Add further package ids to ``op_vip_dataset_ids`` to migrate more
    datasets in one run.  (A long tail of commented-out ids, plus one
    stray bare UUID token that was not valid Python, has been removed.)
    """
    op_vip_dataset_ids = ['f3daf58c-3ab1-4fb9-8f68-33faf3f73625']

    packages_to_migrate = []
    for dataset_id in op_vip_dataset_ids:
        condition = Package.id == dataset_id
        package = find_any_in_database(TEST_CONFIG_FILE_PATH, condition, Package)[0]
        packages_to_migrate.append(package)

    controlled_vocabulary = ControlledVocabulary()
    for package in packages_to_migrate:
        datasets_migration_manager.migrate_package_to_virtuoso(
            config_file_path=TEST_CONFIG_FILE_PATH,
            package=package,
            controlled_vocabulary=controlled_vocabulary)
def test_migrate_3_most_viewed_packages_to_virtuoso(self):
    """Migrate the three most-viewed packages and verify each description
    can be read back from the triple store."""
    controlled_vocabulary = ControlledVocabulary()

    # Fetch and migrate the three packages in their original order.
    for package_id in (TED_PACKAGE_ID,
                       DGT_TRANSLATION_PACKAGE_ID,
                       CORDISH2020PROJECTS_PACKAGE_ID):
        condition = Package.id == package_id
        package = find_any_in_database(TEST_CONFIG_FILE_PATH, condition, Package)[0]
        datasets_migration_manager.migrate_package_to_virtuoso(
            config_file_path=TEST_CONFIG_FILE_PATH,
            package=package,
            controlled_vocabulary=controlled_vocabulary)

    # Each migrated dataset must be readable from the triple store.
    for dataset_name in ("ted-1", "cordisH2020projects", "dgt-translation-memory"):
        migrated_dataset = DatasetDcatApOp(DATASET_URI_PREFIX + dataset_name)
        result = migrated_dataset.get_description_from_ts()
        assert result is True
def test_migrate_dataset_in_multiple_groups(self):
    """Migrate a package that belongs to multiple groups and verify it
    lands in the private graph of the triple store."""
    condition = Package.id == CONNECT_SPARQL_ENDPOINT_ID
    connect_package = find_any_in_database(TEST_CONFIG_FILE_PATH, condition, Package)[0]

    datasets_migration_manager.migrate_package_to_virtuoso(
        config_file_path=TEST_CONFIG_FILE_PATH,
        package=connect_package,
        controlled_vocabulary=ControlledVocabulary())

    migrated_dataset = DatasetDcatApOp(DATASET_URI_PREFIX + "connect-sparql-endpoint",
                                       graph_name=DCATAPOP_PRIVATE_GRAPH_NAME)
    result = migrated_dataset.get_description_from_ts()
    assert result is True
def test_migrate_dataset_in_group(self):
    """Migrate the ECB web-service package and verify its description is
    readable from the private graph."""
    condition = Package.id == ECB_WEB_SERVICE_PACKAGE_ID
    ecb_package = find_any_in_database(TEST_CONFIG_FILE_PATH, condition, Package)[0]

    datasets_migration_manager.migrate_package_to_virtuoso(
        config_file_path=TEST_CONFIG_FILE_PATH,
        package=ecb_package,
        controlled_vocabulary=ControlledVocabulary())

    migrated_dataset = DatasetDcatApOp(DATASET_URI_PREFIX + "ecb-web-service",
                                       graph_name=DCATAPOP_PRIVATE_GRAPH_NAME)
    result = migrated_dataset.get_description_from_ts()
    assert result is True
def test_migrate_most_viewed_package_to_virtuoso(self):
    """Migrate the single most-viewed package (TED) and verify its
    description can be read back from the triple store."""
    vocabulary = ControlledVocabulary()
    condition = Package.id == TED_PACKAGE_ID
    ted_package = find_any_in_database(TEST_CONFIG_FILE_PATH, condition, Package)[0]

    datasets_migration_manager.migrate_package_to_virtuoso(
        config_file_path=TEST_CONFIG_FILE_PATH,
        package=ted_package,
        controlled_vocabulary=vocabulary)

    migrated_dataset = DatasetDcatApOp(DATASET_URI_PREFIX + "ted-1")
    result = migrated_dataset.get_description_from_ts()
    assert result is True
def test_migrate_100_packages_to_virtuoso(self):
    """Migrate the first 100 active packages and log the elapsed time."""
    started_at = time.time()
    vocabulary = ControlledVocabulary()
    active_packages = postgresql_helper.get_all_active_packages(
        TEST_CONFIG_FILE_PATH)[:100]  # type: list[Package]

    for active_package in active_packages:
        datasets_migration_manager.migrate_package_to_virtuoso(
            config_file_path=TEST_CONFIG_FILE_PATH,
            package=active_package,
            controlled_vocabulary=vocabulary)

    # Rough timing indicator for the batch; not an assertion.
    log.info(time.time() - started_at)
# published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # # contact: <https://publications.europa.eu/en/web/about-us/contact> from ckanext.ecportal.migration.datasets_migration_manager import ControlledVocabulary controlled_vocabulary = ControlledVocabulary() FILE_FORMAT_MAPPING = { "interactive": "http://publications.europa.eu/resource/authority/file-type/DCR", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "http://publications.europa.eu/resource/authority/file-type/XLSX", "image/jpeg": "http://publications.europa.eu/resource/authority/file-type/JPEG", "application/x-e00": "http://publications.europa.eu/resource/authority/file-type/E00", "text/comma-separated-values": "http://publications.europa.eu/resource/authority/file-type/CSV", "text/n3": "http://publications.europa.eu/resource/authority/file-type/RDF_N_TRIPLES", "application/vnd.openxmlformats-officedocument.presentationml.presentation": "http://publications.europa.eu/resource/authority/file-type/PPTX", "application/ms-word": "http://publications.europa.eu/resource/authority/file-type/DOC", "application/vnd.ms-excel": "http://publications.europa.eu/resource/authority/file-type/XLS", "application/x-compress": "http://publications.europa.eu/resource/authority/file-type/TAR", "ZIP": "http://publications.europa.eu/resource/authority/file-type/ZIP", "application/x-mxd": "http://publications.europa.eu/resource/authority/file-type/MXD", "application/x-n3": "http://publications.europa.eu/resource/authority/file-type/RDF_TURTLE",
def convert_package_to_dataset(package=None,
                               controlled_vocabulary=None,
                               configuration_file=CONFIGURATION_FILE_PATH):
    """Convert a CKAN postgres ``Package`` into a ``DatasetDcatApOp``.

    :param package: CKAN package to convert.  A fresh empty ``Package`` is
        created when omitted.  (Previously the default was ``Package()``,
        a single instance built at import time and shared between calls --
        the classic mutable-default pitfall; same for the vocabulary.)
    :param controlled_vocabulary: ``ControlledVocabulary`` used to resolve
        controlled URIs; built fresh when omitted.
    :param configuration_file: path to the CKAN configuration file.
    :return: the populated ``DatasetDcatApOp`` (``dataset.schema`` set).
    """
    # Build fresh defaults per call instead of sharing import-time instances.
    if package is None:
        package = Package()
    if controlled_vocabulary is None:
        controlled_vocabulary = ControlledVocabulary()

    package_extra_list = \
        retrieve_package_extra_list_from_postgres(configuration_file, package)  # type: list[PackageExtra]
    tag_list = retrieve_tag_list_from_postgres(configuration_file, package)
    resource_list = retrieve_resource_list(configuration_file, package)
    dataset_uri = DATASET_URI_PREFIX + package.name
    dataset = DatasetDcatApOp(dataset_uri)
    dataset.graph_name = DCATAPOP_PUBLIC_GRAPH_NAME
    if package.private:
        # Private packages go to the private graph with private state.
        dataset.graph_name = DCATAPOP_PRIVATE_GRAPH_NAME
        dataset.privacy_state = PRIVACY_STATE_PRIVATE
    dataset_schema = DatasetSchemaDcatApOp(dataset_uri,
                                           graph_name=dataset.graph_name)  # 1...1
    # dataset_schema.identifier_adms['0'] = SchemaGeneric(dataset_uri)
    dataset.schema_catalog_record = set_catalog_record(package, package_extra_list, dataset_schema)
    dataset_schema.versionInfo_owl['0'] = ResourceValue(package.version)
    # dataset_schema.isPartOfCatalog_dcatapop['0'] = CatalogSchemaDcatApOp(uri_util.new_cataloge_uri_from_title())
    set_landing_page(dataset_schema, package)
    set_package_titles(configuration_file, dataset_schema, package)  # 0...n
    set_package_descriptions(configuration_file, dataset_schema, package)  # 0...n
    dataset_schema.ckanName_dcatapop['0'] = ResourceValue(package.name)  # 1...1
    dataset_schema.modified_dcterms['0'] = ResourceValue(str(package.metadata_modified))
    # Groups are retrieved once and reused for publisher/theme/group below.
    groups = retrieve_groups(configuration_file, package)
    set_publisher_and_theme_and_group(dataset_schema, groups,
                                      controlled_vocabulary.controlled_publishers)  # 0...1
    if not dataset_schema.publisher_dcterms.get('0', None):
        # Fall back to the owning organisation as publisher.
        owner = model.Group.get(package.owner_org)
        if owner:
            dataset_schema.publisher_dcterms['0'] = AgentSchemaDcatApOp(
                'http://publications.europa.eu/resource/authority/corporate-body/{0}'.format(owner.name.upper()),
                graph_name=dataset_schema.graph_name)
        else:
            log.warn('Dataset {0} has no publisher'.format(dataset_schema.uri))
            # raise MigrationError(message='Dataset {0} has no publisher'.format(dataset_schema.uri))

    for package_extra in package_extra_list:
        if package_extra.value:
            if package_extra.key == ACCRUAL_PERIODICITY:
                set_accrual_periodicity(dataset_schema, package_extra,
                                        controlled_vocabulary.controlled_frequencies)  # 0...1
            elif package_extra.key == TEMPORAL_COVERAGE_FROM:
                set_temporal(dataset_schema, package_extra)  # 0...1
            elif package_extra.key == TEMPORAL_COVERAGE_TO:
                set_temporal_to(dataset_schema, package_extra)  # 0...1
            elif package_extra.key == ALTERNATIVE_TITLE:
                set_alternative_titles(configuration_file, dataset_schema, package_extra)  # 0...n
            elif package_extra.key == IDENTIFIER:
                set_identifier(dataset_schema, package_extra)  # 0...n
            elif package_extra.key in (METADATA_LANGUAGE, CITATION,
                                       RELEASE_DATE, EVALUATION_DATE, SOURCE,
                                       ANALYST_IN_EXTRA_FIELD,
                                       THIS_IS_EXTRA_FIELD, MODIFIED_DATE,
                                       KIC, CLC, DATA_SOURCE, EIT):
                # Legacy extras deliberately ignored; RELEASE_DATE used to
                # feed issued_dcterms, which is now derived from the package
                # creation timestamp further below.
                pass
            elif package_extra.key == 'version_description':
                set_version_note(dataset_schema, package_extra)

    controlled_status = ""
    for tag in tag_list:  # type: Tag
        if tag.name:
            if not tag.vocabulary_id:  # where voc = /
                set_keyword(dataset_schema, tag, configuration_file)  # 0...n
            elif tag.vocabulary_id == VOC_LANGUAGE_ID:  # where voc = language
                set_language(dataset_schema, tag,
                             controlled_vocabulary.controlled_languages)  # 0...n
            elif tag.vocabulary_id == VOC_GEO_COVERAGE:  # where voc = geographical_coverage
                set_spatial(dataset_schema, tag,
                            controlled_vocabulary.controlled_country)  # 0...n
            elif tag.vocabulary_id == VOC_DATASET_TYPE:  # where voc = dataset_type
                set_dataset_type(dataset_schema, tag)  # 0...1
            elif tag.vocabulary_id == VOC_CONCEPTS_EUROVOC:  # where voc = concepts_eurovoc
                set_subject(dataset_schema, tag)  # 0...1
            elif tag.vocabulary_id == VOC_STATUS:  # where voc = status
                package_status = tag.name  # 0...1
                if package_status:
                    package_status_upper_case = package_status.split('/')[-1].upper()
                    if package_status_upper_case == 'UNDERDEVELOPMENT':
                        package_status_upper_case = 'DEVELOP'
                    # Resolve the status label back to its controlled URI.
                    controlled_status = next(
                        uri for uri, value in controlled_vocabulary.controlled_status.iteritems()
                        if value == package_status_upper_case)
            # TODO no property for that in new ontology
            # elif tag.vocabulary_id == '0311e5a2-c6a0-49c7-84cc-1ceec129fd7c':  # where voc = interoperability_level

    # TODO verify this field
    dataset_schema.issued_dcterms['0'] = ResourceValue(
        str(get_metadata_created_timestamp(package.id)),
        datatype=NAMESPACE_DCATAPOP.xsd + DATE_TIME)  # 0...1

    for resource in resource_list:
        # The type may live in the dedicated column or in the extras blob.
        # (Renamed from ``type``, which shadowed the builtin.)
        resource_type = resource.resource_type or resource.extras
        if MAIN_DOCUMENTATION in resource_type \
                or RELATED_DOCUMENTATION in resource_type \
                or WEB_RELATED_DOCUMENTATION in resource_type:
            set_document(configuration_file, dataset_schema, resource,
                         controlled_vocabulary.controlled_file_types,
                         controlled_vocabulary.controlled_documentation_types)  # 0...n
        else:
            set_distribution(configuration_file, dataset_schema, resource,
                             controlled_status,
                             controlled_vocabulary.controlled_file_types,
                             controlled_vocabulary.controlled_distribution_types)

    set_contact_point(dataset_schema, package_extra_list)
    dataset.schema = dataset_schema
    return dataset