def test_run_failing_task(testpath):
    """Test running a task that fails.

    Executes FailingTestTask and checks that a report of the failed
    event is added to the mongo document.

    :param testpath: temporary directory
    :returns: ``None``
    """
    # Run the task the same way it would be run from command line
    run_luigi_task('FailingTestTask', testpath)

    # Open the workflow database collection
    conf = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    mongoclient = pymongo.MongoClient(host=conf.get('mongodb_host'))
    database = mongoclient[conf.get('mongodb_database')]
    collection = database[conf.get('mongodb_collection')]

    # The report of the failed task is stored under the task name
    report = collection.find_one()['workflow_tasks']['FailingTestTask']

    # Check 'messages' field
    assert report['messages'] == \
        'An error occurred while running a test task: Shit hit the fan'

    # Check 'result' field
    assert report['result'] == 'failure'

    # Parsing fails if the 'timestamp' field is not in the expected
    # format
    datetime.datetime.strptime(report['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')

    # There must not be any extra documents in the collection
    assert collection.count() == 1
def download_file(file_metadata, linkpath="",
                  config_file="/etc/siptools_research.conf",
                  upload_database=None):
    """Fetch a file and optionally hard link it to ``linkpath``.

    The file is read with ``_get_local_file`` when the storage
    identifier in the metadata equals the configured
    ``pas_storage_id``, and with ``_get_ida_file`` otherwise.

    :param file_metadata: File metadata from Metax
    :param linkpath: Path where the hard link is created. No link is
                     created if the value is empty.
    :param config_file: Configuration file
    :param upload_database: upload_rest_api.database.Database object,
                            required for files in the PAS storage
    :raises ValueError: if the file is in the PAS storage and
                        ``upload_database`` is not given
    :returns: ``None``
    """
    conf = Configuration(config_file)
    pas_storage_id = conf.get("pas_storage_id")
    storage_id = file_metadata["file_storage"]["identifier"]

    if storage_id != pas_storage_id:
        source_path = _get_ida_file(file_metadata, conf)
    elif upload_database is None:
        raise ValueError("upload_database parameter required")
    else:
        source_path = _get_local_file(file_metadata, upload_database, conf)

    if linkpath:
        os.link(source_path, linkpath)
def run(self):
    """Create premis events.

    Reads dataset metadata from Metax and creates premis event XML
    files in a temporary directory. The premis event reference file is
    then moved to the output target path and all the other files to the
    SIP creation directory.

    :returns: ``None``
    """
    packaging_root = Configuration(self.config).get('packaging_root')
    tmp_prefix = os.path.join(packaging_root, 'tmp/')

    with TemporaryDirectory(prefix=tmp_prefix) as temporary_workspace:
        _create_premis_events(self.dataset_id,
                              temporary_workspace,
                              self.config)

        # Files are moved out of the temporary directory only after all
        # of them have been successfully created, to avoid atomicity
        # problems. The reference file goes to the temporary output
        # path, which becomes the output target once everything inside
        # this block has succeeded.
        reference_file = os.path.join(temporary_workspace,
                                      'premis-event-md-references.jsonl')
        with self.output().temporary_path() as target_path:
            shutil.move(reference_file, target_path)

            # Everything that is left belongs to the SIP
            for file_ in os.listdir(temporary_workspace):
                shutil.move(os.path.join(temporary_workspace, file_),
                            self.sip_creation_path)
def run(self):
    """Send SIP file to DP service using sftp.

    :returns: ``None``
    """
    # Read host/user/ssh_key_path from configuration file
    conf = Configuration(self.config)

    # The tar file is named after the workspace directory
    tar_file = os.path.basename(self.workspace) + '.tar'
    local_path = os.path.join(self.workspace, tar_file)
    remote_path = os.path.join('transfer', tar_file)
    incomplete_path = os.path.join('transfer', tar_file + '.incomplete')

    # Init SFTP connection
    with paramiko.SSHClient() as ssh:
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(conf.get('dp_host'),
                    username=conf.get('dp_user'),
                    key_filename=conf.get('dp_ssh_key'))

        with ssh.open_sftp() as sftp:
            # Copy tar to remote host. Validation workflow starts
            # when ".incomplete" suffix is removed from target file
            # path.
            sftp.put(local_path, incomplete_path, confirm=False)
            sftp.rename(incomplete_path, remote_path)

    with self.output().open('w') as log:
        log.write('Dataset id=' + self.dataset_id)
def test_run_workflowtask(testpath):
    """Test WorkflowTask execution.

    Executes a TestTask, checks that the output file is created and
    that a new event field is added to the mongo document.

    :param testpath: temporary directory
    :returns: ``None``
    """
    # Run the task the same way it would be run from command line
    run_luigi_task('TestTask', testpath)

    # The task should have written its output file
    with open(os.path.join(testpath, 'output_file')) as output:
        assert output.read() == 'Hello world'

    # Open the workflow database collection
    conf = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    mongoclient = pymongo.MongoClient(host=conf.get('mongodb_host'))
    database = mongoclient[conf.get('mongodb_database')]
    collection = database[conf.get('mongodb_collection')]

    # The report of the task is stored under the task name
    report = collection.find_one()['workflow_tasks']['TestTask']

    # Check 'messages' field
    assert report['messages'] == 'Test task was successfull'

    # Check 'result' field
    assert report['result'] == 'success'

    # Parsing fails if the 'timestamp' field is not in the expected
    # format
    datetime.datetime.strptime(report['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')

    # There must not be any extra documents in the collection
    assert collection.count() == 1
def app(request, test_config):
    """Create web app and mock Metax HTTP responses.

    :param request: pytest request object, used for registering the
                    finalizer that resets and disables httpretty
    :param test_config: path to the configuration file used by the app
    :returns: An instance of the REST API web app.
    """
    # Create app and point it to the test configuration file
    app_ = create_app()
    app_.config.update(SIPTOOLS_RESEARCH_CONF=test_config)
    app_.config['TESTING'] = True

    # Create the directories under packaging root
    packaging_root = Configuration(test_config).get("packaging_root")
    for dirname in ("file_cache", "tmp"):
        os.mkdir(os.path.join(packaging_root, dirname))

    # Mock Metax
    def _fin():
        httpretty.reset()
        httpretty.disable()

    httpretty.enable()
    request.addfinalizer(_fin)
    mock_metax()

    return app_
def __init__(self, *args, **kwargs):
    """Initialize Task.

    Reads the configuration file of the task and creates a Metax
    client from the Metax options found in it.
    """
    super(CreateTechnicalMetadata, self).__init__(*args, **kwargs)

    conf = Configuration(self.config)
    self.config_object = conf

    metax_url = conf.get('metax_url')
    metax_user = conf.get('metax_user')
    metax_password = conf.get('metax_password')
    ssl_verification = conf.getboolean('metax_ssl_verification')
    self.metax_client = Metax(metax_url,
                              metax_user,
                              metax_password,
                              verify=ssl_verification)
def __init__(self, config_file):
    """Initialize a new pymongo client unless a collection is already
    set.

    :param config_file: path to configuration file
    """
    # NOTE(review): this is meant to be a singleton, but the values are
    # assigned through ``self``. Unless the class defines descriptors
    # for these names, each new instance presumably creates its own
    # client -- confirm against the class definition.
    if self._collection is not None:
        return

    conf = Configuration(config_file)
    self._client = pymongo.MongoClient(host=conf.get("mongodb_host"),
                                       port=27017)
    database = self._client[conf.get("mongodb_database")]
    self._collection = database[conf.get("mongodb_collection")]
def test_ida_download(testpath):
    """Download a file from Ida and check its contents.

    Asks the Ida password from the user, downloads a file with
    ``download_file`` and checks the contents of the downloaded file.

    :param testpath: temporary directory fixture
    """
    # Read configuration file
    conf = Configuration(tests.conftest.TEST_CONFIG_FILE)

    # Override Ida password in configuration with the real password
    # given by the user
    ida_password = getpass.getpass(
        prompt='Ida password for user \'testuser_1\':'
    )
    # pylint: disable=protected-access
    conf._parser.set('siptools_research', 'ida_password', ida_password)

    # Download a file that should be available
    file_metadata = {
        'file_path': '/file',
        'identifier': 'pid:urn:111',
        'file_storage': {
            'identifier': 'urn:nbn:fi:att:file-storage-ida'
        }
    }
    download_path = os.path.join(testpath, 'ida_file')
    download_file(file_metadata,
                  download_path,
                  tests.conftest.TEST_CONFIG_FILE)

    # Check contents of downloaded file
    with open(download_path) as open_file:
        assert open_file.read() == 'foo\n'
def test_workflow(testpath, module_name, task, requests_mock):
    """Test workflow dependency tree.

    Run a task (and all tasks it requires) and check that a report of
    the successful task is added to mongodb.

    :param testpath: temporary directory
    :param module_name: submodule of siptools_research.workflow that
                        contains Task to be tested
    :param task: Task class name
    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Mock the dataset, its file and the file download
    tests.utils.add_metax_dataset(requests_mock,
                                  tests.metax_data.datasets.BASE_DATASET,
                                  files=[tests.metax_data.files.TXT_FILE])
    requests_mock.get('https://ida.test/files/pid:urn:identifier/download',
                      text='foo')

    # Init pymongo client
    # NOTE(review): the database options are read from TEST_CONFIG_FILE
    # while the task is run with UNIT_TEST_CONFIG_FILE -- confirm that
    # this is intended.
    conf = Configuration(tests.conftest.TEST_CONFIG_FILE)
    mongoclient = pymongo.MongoClient(host=conf.get('mongodb_host'))

    with mock.patch.object(RemoteAnyTarget, '_exists', _mock_exists):
        workspace = os.path.join(testpath, 'workspaces', 'workspace')
        module = importlib.import_module(
            'siptools_research.workflow.' + module_name
        )
        task_instance = getattr(module, task)(
            workspace=workspace,
            dataset_id='dataset_identifier',
            config=tests.conftest.UNIT_TEST_CONFIG_FILE
        )
        luigi.build([task_instance], local_scheduler=True)

    database = mongoclient[conf.get('mongodb_database')]
    collection = database[conf.get('mongodb_collection')]
    document = collection.find_one()

    # Check 'result' field
    assert document['workflow_tasks'][task]['result'] == 'success'

    if module_name == "cleanup":
        assert document["completed"]
def clean_file_cache(config_file):
    """Remove files that have not been accessed in two weeks from
    <packaging_root>/file_cache.

    :param config_file: path to configuration file
    :returns: ``None``
    """
    packaging_root = Configuration(config_file).get("packaging_root")
    cache_path = os.path.join(packaging_root, "file_cache")

    now = time.time()
    two_weeks = 60 * 60 * 24 * 14

    # Walk through the whole cache, removing every file whose last
    # access is older than the limit
    for dirpath, _, filenames in os.walk(cache_path):
        for filepath in (os.path.join(dirpath, name) for name in filenames):
            if now - os.stat(filepath).st_atime > two_weeks:
                os.remove(filepath)
def run(self):
    """Sign METS document.

    Creates a signature for the ``mets.xml`` file of the workspace
    using the key configured as ``sip_sign_key``, and writes the
    signature to the output target.

    :returns: ``None``
    """
    sign_key = Configuration(self.config).get("sip_sign_key")
    signature = dpres_signature.signature.create_signature(
        self.workspace, sign_key, ['mets.xml']
    )

    with self.output().open('wb') as signature_file:
        signature_file.write(signature)
def output(self):
    """Return the output target of this Task.

    :returns: remote target that may exist on digital preservation
              server in any path formatted::

                  ~/accepted/<datepath>/<document_id>.tar/
                  ~/rejected/<datepath>/<document_id>.tar/

              where datepath is any date between the date the SIP was
              sent to the server and the current date.
    :rtype: RemoteAnyTarget
    """
    conf = Configuration(self.config)
    database = Database(self.config)

    # Get SendSIPToDP completion datetime or use the current UTC
    # time. This is necessary since ValidateSip output is checked
    # first time before any of the dependencies are ran.
    # Dependencies are ran only if ValidateSip task is not
    # completed.
    try:
        send_timestamp = database.get_event_timestamp(self.document_id,
                                                      "SendSIPToDP")
        sip_to_dp_date = dateutil.parser.parse(send_timestamp).date()
    except (ValueError, KeyError):
        sip_to_dp_date = datetime.utcnow().date()

    # The first date may be a UTC date, so the last date must not be
    # earlier than the current UTC date. Using only the local date
    # would produce an empty list of paths whenever the local date is
    # behind the UTC date.
    lim_date = max(datetime.utcnow().date(), datetime.today().date())

    # Both the accepted and the rejected path of each date are possible
    # locations of the target
    path = []
    while sip_to_dp_date <= lim_date:
        path.append('accepted/%s/%s.tar' % (sip_to_dp_date,
                                            self.document_id))
        path.append('rejected/%s/%s.tar' % (sip_to_dp_date,
                                            self.document_id))
        sip_to_dp_date += timedelta(days=1)

    return RemoteAnyTarget(path,
                           conf.get('dp_host'),
                           conf.get('dp_user'),
                           conf.get('dp_ssh_key'))
def test_verify_file_contained_by_dataset_files():
    """Test is_consistent_for_file method.

    Check that ``DatasetConsistency::is_consistent_for_file()``
    succeeds when the dataset files contain the file.

    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_options = [
        configuration.get(option)
        for option in ('metax_url', 'metax_user', 'metax_password')
    ]
    client = Metax(
        *metax_options,
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # The file is listed directly in the files of the dataset
    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': {
            'files': [{'identifier': 'file_identifier'}],
            'directories': []
        }
    }
    file_metadata = {
        'identifier': 'file_identifier',
        'file_path': "/path/to/file",
        'parent_directory': {'identifier': 'parent_directory_identifier'}
    }

    try:
        consistency = DatasetConsistency(client, dataset)
        consistency.is_consistent_for_file(file_metadata)
    except InvalidDatasetMetadataError as exc:
        pytest.fail(
            '_verify_file_contained_by_dataset raised exception: ' + str(exc)
        )
def test_validate_datacite(requests_mock):
    """Test _validate_datacite.

    The function should raise an exception with a readable error
    message when the datacite XML contains multiple errors.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_url = configuration.get('metax_url')
    metax_user = configuration.get('metax_user')
    metax_password = configuration.get('metax_password')
    metax_client = Metax(
        metax_url,
        metax_user,
        metax_password,
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # Metax responds with invalid datacite
    datacite_url = ("https://metaksi/rest/v1/datasets/dataset_identifier?"
                    "dataset_format=datacite")
    requests_mock.get(datacite_url, content=get_very_invalid_datacite())

    # Try to validate datacite
    expected_error = "Datacite metadata is invalid:"
    with pytest.raises(InvalidDatasetMetadataError, match=expected_error):
        # pylint: disable=protected-access
        metadata_validator._validate_datacite('dataset_identifier',
                                              metax_client)
def test_validate_file_metadata_invalid_metadata(requests_mock):
    """Test ``_validate_file_metadata``.

    The function should raise exceptions with descriptive error
    messages.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # File characteristics without the 'file_format' property
    invalid_file = copy.deepcopy(BASE_FILE)
    invalid_file['file_characteristics'] = {
        "file_created": "2014-01-17T08:19:31Z"
    }
    tests.utils.add_metax_dataset(requests_mock, files=[invalid_file])

    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_options = [
        configuration.get(option)
        for option in ('metax_url', 'metax_user', 'metax_password')
    ]
    client = Metax(
        *metax_options,
        verify=configuration.getboolean('metax_ssl_verification')
    )

    expected_error = (
        "Validation error in metadata of path/to/file: 'file_format' is"
        " a required property\n\nFailed validating 'required' in schema"
    )
    dataset = {'identifier': 'dataset_identifier'}
    with pytest.raises(InvalidFileMetadataError, match=expected_error):
        # pylint: disable=protected-access
        siptools_research.metadata_validator._validate_file_metadata(
            dataset, client, configuration
        )
def test_validate_file_metadata(requests_mock):
    """Test _validate_file_metadata.

    Validates a dataset with two files and checks that the files of
    the dataset are requested from Metax only once.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    dataset = copy.deepcopy(BASE_DATASET)
    dataset['research_dataset']['directories'] = [{'identifier': 'root_dir'}]

    file_1 = copy.deepcopy(TXT_FILE)
    file_1['identifier'] = 'file_identifier1'
    file_2 = copy.deepcopy(TXT_FILE)
    file_2['identifier'] = 'file_identifier2'

    # Mocked directory responses, in the order they are registered
    directories = [
        ('pid:urn:dir:wf1', {
            'identifier': 'first_par_dir',
            'directory_path': '',
            'parent_directory': {'identifier': 'second_par_dir'}
        }),
        ('second_par_dir', {
            'identifier': 'second_par_dir',
            'directory_path': '',
            'parent_directory': {'identifier': 'root_dir'}
        }),
        ('root_dir', {
            'identifier': 'second_par_dir',
            'directory_path': '/'
        }),
    ]
    for directory_id, response in directories:
        requests_mock.get(
            tests.conftest.METAX_URL + '/directories/' + directory_id,
            json=response,
            status_code=200
        )

    files_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/datasets/dataset_identifier/files',
        json=[file_1, file_2],
        status_code=200
    )

    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_url = configuration.get('metax_url')
    metax_user = configuration.get('metax_user')
    metax_password = configuration.get('metax_password')
    client = Metax(
        metax_url,
        metax_user,
        metax_password,
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # pylint: disable=protected-access
    siptools_research.metadata_validator._validate_file_metadata(
        dataset, client, configuration
    )

    assert files_adapter.call_count == 1
def find_file_categories(self):
    """Create logical structure map of dataset files.

    Returns dictionary with filecategories as keys and lists of
    filepaths as values.

    :raises InvalidDatasetMetadataError: if file category is not found
                                         for a file or for its parent
                                         directory
    :returns: logical structure map dictionary
    """
    config_object = Configuration(self.config)
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification')
    )
    dataset_files = metax_client.get_dataset_files(self.dataset_id)
    dataset_metadata = metax_client.get_dataset(self.dataset_id)
    languages = get_dataset_languages(dataset_metadata)
    dirpath2usecategory = get_dirpath_dict(metax_client, dataset_metadata)

    logical_struct = {}
    for dataset_file in dataset_files:
        file_id = dataset_file['identifier']
        file_path = dataset_file['file_path']

        # Get the use category of file. The path to the file in
        # logical structmap is stored in 'use_category' in metax.
        filecategory = find_file_use_category(file_id, dataset_metadata)

        # If file listed in datasets/<id>/files is not listed in
        # 'files' section of dataset metadata, look for
        # parent_directory of the file from 'directories' section.
        # The "use_category" of file is the "use_category" of the
        # parent directory.
        if filecategory is None:
            # The directory path is the file path without the file
            # name. An empty file name must be handled separately,
            # because slicing with [:-0] would produce an empty string
            # instead of the whole path.
            name_len = len(dataset_file["file_name"])
            dirpath = file_path[:-name_len] if name_len else file_path
            filecategory = find_dir_use_category(dirpath,
                                                 dirpath2usecategory,
                                                 languages)

        # If file category was not found even for the parent
        # directory, raise error
        if filecategory is None:
            raise InvalidDatasetMetadataError(
                "File category for file {} was not found".format(file_id)
            )

        # Append path to the list of the category, creating the list if
        # it does not exist already
        logical_struct.setdefault(filecategory, []).append(file_path)

    return logical_struct
def run(self):
    """Get dataset 1 from Metax.

    The dataset returned by Metax is not used for anything.

    :returns: ``None``
    """
    conf = Configuration(self.config)
    metax_options = [
        conf.get(option)
        for option in ('metax_url', 'metax_user', 'metax_password')
    ]
    metax_client = Metax(
        *metax_options,
        verify=conf.getboolean('metax_ssl_verification')
    )
    metax_client.get_dataset('1')
def test_directory_validation_caching_works(requests_mock):
    """Test directory validation caching.

    Two files are contained by same directory. Metax is called only
    once for each directory in tree and hence the directory validation
    as well.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(
        configuration.get('metax_url'),
        configuration.get('metax_user'),
        configuration.get('metax_password'),
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # Mock a directory tree: root <- second_par <- first_par
    first_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/first_par',
        json={
            'identifier': 'first_par',
            'directory_path': '/second_par/first_par',
            'parent_directory': {'identifier': 'second_par'}
        },
        status_code=200
    )
    second_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/second_par',
        json={
            'identifier': 'second_par',
            'directory_path': '/second_par',
            'parent_directory': {'identifier': 'root'}
        },
        status_code=200
    )
    root_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/root',
        json={
            'identifier': 'root',
            'directory_path': '/'
        },
        status_code=200
    )

    # The second file differs from the first one only by its path,
    # which is a string like in the other file metadata of the tests
    file2_metadata = copy.deepcopy(FILE_METADATA)
    file2_metadata['file_path'] = "/path/to/file2"

    try:
        validator = DirectoryValidation(client)
        validator.is_valid_for_file(FILE_METADATA)
        validator.is_valid_for_file(file2_metadata)
    except InvalidDatasetMetadataError as exc:
        pytest.fail(
            'test_directory_validation_caching_works fails: ' + str(exc)
        )

    # verify that metax is called only once for directories
    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1
    assert root_dir_adapter.call_count == 1
def _create_premis_events(dataset_id, workspace, config):
    """Create premis events from provenance metadata.

    Reads dataset provenance metadata from Metax and calls
    ``premis_event.premis_event`` once for each provenance object.

    :param dataset_id: dataset identifier
    :param workspace: SIP creation directory
    :param config: path to configuration file
    :returns: ``None``
    """
    conf = Configuration(config)
    metax_client = Metax(
        conf.get('metax_url'),
        conf.get('metax_user'),
        conf.get('metax_password'),
        verify=conf.getboolean('metax_ssl_verification')
    )
    metadata = metax_client.get_dataset(dataset_id)
    dataset_languages = get_dataset_languages(metadata)

    def _localize(value):
        """Pick the localized value using the dataset languages."""
        return get_localized_value(value, languages=dataset_languages)

    for provenance in metadata["research_dataset"]["provenance"]:
        event_type = _localize(
            provenance["preservation_event"]["pref_label"]
        )

        # The start date is used as the event datetime when available
        try:
            event_datetime = provenance["temporal"]["start_date"]
        except KeyError:
            event_datetime = 'OPEN'

        event_detail = _localize(provenance["description"])
        event_outcome = _localize(provenance["event_outcome"]["pref_label"])
        event_outcome_detail = _localize(provenance["outcome_description"])

        premis_event.premis_event(
            workspace=workspace,
            event_type=event_type,
            event_datetime=event_datetime,
            event_detail=event_detail,
            event_outcome=event_outcome,
            event_outcome_detail=event_outcome_detail
        )
def test_successful_directory_validation_fails(requests_mock):
    """Test validation of invalid directory tree.

    The root directory is missing the `directory_path` attribute.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_options = [
        configuration.get(option)
        for option in ('metax_url', 'metax_user', 'metax_password')
    ]
    client = Metax(
        *metax_options,
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # Mock a directory tree: root <- second_par <- first_par
    directories_url = tests.conftest.METAX_URL + '/directories/'
    first_par_dir_adapter = requests_mock.get(
        directories_url + 'first_par',
        json={
            'identifier': 'first_par',
            'directory_path': '/second_par/first_par',
            'parent_directory': {'identifier': 'second_par'}
        },
        status_code=200
    )
    second_par_dir_adapter = requests_mock.get(
        directories_url + 'second_par',
        json={
            'identifier': 'second_par',
            'directory_path': '/second_par',
            'parent_directory': {'identifier': 'root'}
        },
        status_code=200
    )
    root_dir_adapter = requests_mock.get(
        directories_url + 'root',
        json={'identifier': 'root'},
        status_code=200
    )

    with pytest.raises(InvalidDatasetMetadataError) as exc_info:
        validator = DirectoryValidation(client)
        validator.is_valid_for_file(FILE_METADATA)

    error_message = str(exc_info.value)
    assert error_message.startswith(
        "Validation error in metadata of root: "
        "'directory_path' is a required property"
    )

    # Each directory has been requested from Metax once
    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1
    assert root_dir_adapter.call_count == 1
def test_successful_directory_validation(requests_mock):
    """Directory validation of valid directory tree.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_url = configuration.get('metax_url')
    metax_user = configuration.get('metax_user')
    metax_password = configuration.get('metax_password')
    client = Metax(
        metax_url,
        metax_user,
        metax_password,
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # Mock a directory tree: root <- second_par <- first_par
    directories_url = tests.conftest.METAX_URL + '/directories/'
    first_par_dir_adapter = requests_mock.get(
        directories_url + 'first_par',
        json={
            'identifier': 'first_par',
            'directory_path': '/second_par/first_par',
            'parent_directory': {'identifier': 'second_par'}
        },
        status_code=200
    )
    second_par_dir_adapter = requests_mock.get(
        directories_url + 'second_par',
        json={
            'identifier': 'second_par',
            'directory_path': '/second_par',
            'parent_directory': {'identifier': 'root'}
        },
        status_code=200
    )
    root_dir_adapter = requests_mock.get(
        directories_url + 'root',
        json={
            'identifier': 'root',
            'directory_path': '/'
        },
        status_code=200
    )

    try:
        validator = DirectoryValidation(client)
        validator.is_valid_for_file(FILE_METADATA)
    except InvalidDatasetMetadataError as exc:
        pytest.fail('test_successful_directory_validation fails: ' + str(exc))

    # Each directory has been requested from Metax once
    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1
    assert root_dir_adapter.call_count == 1
def run(self):
    """Copy datacite.xml metadatafile from Metax.

    Writes the datacite metadata of the dataset to ``datacite.xml`` in
    the workspace and imports it with siptools in a temporary
    directory. The premis event reference file is then moved to the
    output target path and all the other created files to the SIP
    creation directory.

    :returns: ``None``
    """
    # Get datacite.xml from Metax
    conf = Configuration(self.config)
    metax_client = Metax(
        conf.get('metax_url'),
        conf.get('metax_user'),
        conf.get('metax_password'),
        verify=conf.getboolean('metax_ssl_verification')
    )
    dataset = metax_client.get_dataset(self.dataset_id)
    datacite = metax_client.get_datacite(dataset['identifier'])

    # Write datacite.xml to file
    datacite_path = os.path.join(self.workspace, 'datacite.xml')
    datacite.write(datacite_path)

    tmp_prefix = os.path.join(conf.get('packaging_root'), 'tmp/')
    with TemporaryDirectory(prefix=tmp_prefix) as temporary_workspace:
        # Create output files with siptools
        import_description.import_description(
            dmdsec_location=datacite_path,
            workspace=temporary_workspace,
            without_uuid=True
        )

        # Move created files out of the temporary directory. The
        # reference file goes to the temporary output path, which
        # becomes the output target once everything inside this block
        # has succeeded.
        reference_file = os.path.join(temporary_workspace,
                                      'premis-event-md-references.jsonl')
        with self.output().temporary_path() as target_path:
            shutil.move(reference_file, target_path)

            # Everything that is left belongs to the SIP
            for file_ in os.listdir(temporary_workspace):
                shutil.move(os.path.join(temporary_workspace, file_),
                            self.sip_creation_path)
def get_provenance_ids(self):
    """List identifiers of provenance events.

    Gets list of dataset provenance events from Metax, and maps the
    event type of each provenance event to the event identifier read
    from the PREMIS event files found in the SIP creation directory.

    :returns: list of provenance IDs
    """
    conf = Configuration(self.config)
    metax_client = Metax(
        conf.get('metax_url'),
        conf.get('metax_user'),
        conf.get('metax_password'),
        verify=conf.getboolean('metax_ssl_verification')
    )
    metadata = metax_client.get_dataset(self.dataset_id)
    languages = get_dataset_languages(metadata)

    # Get the reference file path from Luigi task input. Only the file
    # name is used, since the workspace path is given separately.
    reference_file = os.path.basename(
        self.input()['create_provenance_information'].path
    )
    references = read_md_references(self.workspace, reference_file)
    event_ids = get_md_references(references)

    # Map event types found in the existing event files to event
    # identifiers
    event_type_ids = {}
    for event_id in event_ids:
        # The first character of the identifier is not part of the
        # file name
        event_file_path = os.path.join(
            self.sip_creation_path,
            event_id[1:] + "-PREMIS%3AEVENT-amd.xml"
        )
        if os.path.exists(event_file_path):
            root = ET.parse(encode_path(event_file_path)).getroot()
            event_type_elements = root.xpath("//premis:eventType",
                                             namespaces=NAMESPACES)
            event_type_ids[event_type_elements[0].text] = event_id

    provenance_ids = []
    for provenance in metadata["research_dataset"]["provenance"]:
        event_type = get_localized_value(
            provenance["preservation_event"]["pref_label"],
            languages=languages
        )
        provenance_ids.append(event_type_ids[event_type])

    return provenance_ids
def test_send_sip(testpath):
    """Test the SendSipToDP task.

    Run task and check that .tar is copied to digital preservation
    server. This test uses real DPS test instance. To run this test, a
    valid ssh key for `tpas` user on `customer-test-tpas-1` must be
    copied to current working directory and renamed as `ssh_key`.

    :param testpath: Temporary directory fixture
    :returns: ``None``
    """
    # Create workspace with directories and files required by the task
    workspace = testpath
    os.makedirs(os.path.join(workspace, 'sip-in-progress'))
    tar_file_name = os.path.basename(workspace) + ".tar"
    shutil.copy('tests/data/testsips/simple_sip.tar',
                os.path.join(workspace, tar_file_name))

    conf = Configuration(tests.conftest.TEST_CONFIG_FILE)

    # Init sftp connection to digital preservation server before
    # running the task, for instant verification later on. The
    # context managers close the connection also when the test fails.
    with paramiko.SSHClient() as ssh:
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(conf.get('dp_host'),
                    username=conf.get('dp_user'),
                    key_filename=conf.get('dp_ssh_key'))

        with ssh.open_sftp() as sftp:
            # Init and run task
            task = SendSIPToDP(workspace=workspace,
                               dataset_id='1',
                               config=tests.conftest.TEST_CONFIG_FILE)
            task.run()
            assert task.complete()

            # Check that tar-file is created on remote host.
            # NOTE: Tar is copied to ~/transfer/. From there it is
            # automatically moved to /var/spool/preservation/ and after
            # validation it is moved to
            # ~/rejected/<datedir>/<workspace>.tar/. There is a risk
            # that file is moved from ~/transfer before this test is
            # finished.
            target_file_path = "transfer/" + tar_file_name
            logging.debug('Looking for file: %s on server: %s',
                          target_file_path,
                          conf.get('dp_host'))
            sftp.stat(target_file_path)
def get_identifiers(self):
    """Get file identifiers.

    Return a list of all the file identifiers of the dataset and the
    path to the file cache. The list is empty if the dataset is not
    available.

    :returns: Tuple (list of identifiers, cache_path)
    """
    conf = Configuration(self.config)
    cache_path = os.path.join(conf.get("packaging_root"), "file_cache")

    metax_client = Metax(
        conf.get('metax_url'),
        conf.get('metax_user'),
        conf.get('metax_password'),
        verify=conf.getboolean('metax_ssl_verification')
    )

    identifiers = []
    try:
        for dataset_file in metax_client.get_dataset_files(self.dataset_id):
            identifiers.append(dataset_file["identifier"])
    except DatasetNotAvailableError:
        # Nothing is listed for a dataset that is not available
        identifiers = []

    return identifiers, cache_path
def run(self):
    """Read list of required files from Metax and download them.

    Files are written to path based on ``file_path`` in Metax.

    :raises InvalidFileMetadataError: if the ``file_path`` of a file
                                      does not point inside the
                                      download directory
    :returns: ``None``
    """
    upload_database = upload_rest_api.database.Database()

    # Find file identifiers from Metax dataset metadata.
    config_object = Configuration(self.config)
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification')
    )
    dataset_files = metax_client.get_dataset_files(self.dataset_id)

    # Download files to temporary directory which will be moved to
    # output target path when all files have been downloaded
    with self.output().temporary_path() as temporary_directory:
        os.mkdir(temporary_directory)

        # Normalized path of the download directory, ending with a path
        # separator. A plain prefix comparison without the separator
        # would also accept sibling paths such as "<directory>-other".
        download_root = os.path.join(
            os.path.normpath(temporary_directory), ''
        )

        for dataset_file in dataset_files:
            identifier = dataset_file["identifier"]

            # Full path to file
            target_path = os.path.normpath(
                os.path.join(temporary_directory,
                             dataset_file["file_path"].strip('/'))
            )

            # The file must be written inside the download directory
            if not target_path.startswith(download_root):
                raise InvalidFileMetadataError(
                    'The file path of file %s is invalid: %s'
                    % (identifier, dataset_file["file_path"])
                )

            # Create the download directory for file if it does not
            # exist already
            target_directory = os.path.dirname(target_path)
            if not os.path.isdir(target_directory):
                # TODO: Use exist_ok -parameter when moving to
                # python3
                os.makedirs(target_directory)

            download_file(dataset_file, target_path, self.config,
                          upload_database)
def test_verify_file_contained_by_dataset_missing_from_dataset(requests_mock):
    """Test is_consistent_for_file method.

    Check that ``DatasetConsistency::is_consistent_for_file()`` raises
    an exception with a descriptive error message when neither the
    dataset files nor the dataset directories contain the file.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_options = [
        configuration.get(option)
        for option in ('metax_url', 'metax_user', 'metax_password')
    ]
    client = Metax(
        *metax_options,
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # The dataset does not list any files or directories
    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': {'files': [], 'directories': []}
    }
    file_metadata = {
        'identifier': 'file_identifier',
        'file_path': "/path/to/file",
        'parent_directory': {'identifier': 'parent_directory_identifier'}
    }

    # The mocked parent directory has no parent directory
    requests_mock.get(
        tests.conftest.METAX_URL + '/directories/parent_directory_identifier',
        json={'identifier': 'parent_directory_identifier'},
        status_code=200
    )

    with pytest.raises(InvalidDatasetMetadataError) as exc_info:
        consistency = DatasetConsistency(client, dataset)
        consistency.is_consistent_for_file(file_metadata)

    expected_message = ("File not found from dataset files nor "
                        "directories: /path/to/file")
    assert str(exc_info.value) == expected_message
def run(self):
    """Report preservation status to Metax.

    Checks the path of ingest report file in digital preservation
    service. If the ingest report is in ~/accepted/.../ directory, the
    dataset has passed validation. If the report is found in
    ~/rejected/.../ directory, or somewhere else, an exception is
    raised. The event handlers will deal with the exceptions.

    :raises InvalidSIPError: if the report is in the rejected directory
    :raises ValueError: if the report is in any other directory
    :returns: ``None``
    """
    # List of all matching paths ValidateSIP found
    ingest_report_paths = self.input()[0].existing_paths()

    # Only one ingest report should be found
    assert len(ingest_report_paths) == 1
    report_path = ingest_report_paths[0]

    # The first component of the path tells the result of validation
    directory = report_path.split('/')[0]

    if directory == 'rejected':
        # Raise exception that informs event handler that dataset
        # did not pass validation
        raise InvalidSIPError("SIP was rejected")

    if directory != 'accepted':
        raise ValueError('Report was found in incorrect '
                         'path: %s' % report_path)

    # Init metax
    conf = Configuration(self.config)
    metax_client = Metax(
        conf.get('metax_url'),
        conf.get('metax_user'),
        conf.get('metax_password'),
        verify=conf.getboolean('metax_ssl_verification')
    )

    # Mark the dataset as being in digital preservation in Metax
    metax_client.set_preservation_state(
        self.dataset_id,
        state=DS_STATE_IN_DIGITAL_PRESERVATION,
        system_description='Accepted to preservation'
    )

    with self.output().open('w') as output:
        output.write('Dataset id=' + self.dataset_id)