def test_SIP_files(db): """Test the files methods of API SIP.""" # we create a SIP model sip = SIP_.create() db.session.commit() # We create an API SIP on top of it api_sip = SIP(sip) assert len(api_sip.files) == 0 # we setup a file storage tmppath = tempfile.mkdtemp() db.session.add(Location(name='default', uri=tmppath, default=True)) db.session.commit() # we create a file content = b'test lol\n' bucket = Bucket.create() obj = ObjectVersion.create(bucket, 'test.txt', stream=BytesIO(content)) db.session.commit() # we attach it to the SIP sf = api_sip.attach_file(obj) db.session.commit() assert len(api_sip.files) == 1 assert api_sip.files[0].filepath == 'test.txt' assert sip.sip_files[0].filepath == 'test.txt' # finalization rmtree(tmppath)
def archive_sip(sip_uuid):
    """Send the SIP for archiving.

    Retries every 4 hours, six times, which should work for up to 24 hours
    archiving system downtime.

    :param sip_uuid: UUID of the SIP for archiving.
    :type sip_uuid: str
    :raises ArchivingError: if the SIP has no BagIt metadata or was
        already archived (these cases are never retried).
    """
    try:
        sip = SIPApi(SIP.query.get(sip_uuid))
        archiver = BagItArchiver(sip)
        bagmeta = archiver.get_bagit_metadata(sip)
        # Guard against SIPs that cannot (no bagit metadata) or should not
        # (already archived) be written.
        if bagmeta is None:
            raise ArchivingError(
                'Bagit metadata does not exist for SIP: {0}.'.format(sip.id))
        if sip.archived:
            raise ArchivingError(
                'SIP was already archived {0}.'.format(sip.id))
        archiver.write_all_files()
        sip.archived = True
        db.session.commit()
    except Exception as exc:
        # On ArchivingError (see above), do not retry, but re-raise
        # (any other exception triggers a Celery retry before re-raising).
        if not isinstance(exc, ArchivingError):
            archive_sip.retry(exc=exc)
        raise
def test_SIP_create(app, db, mocker): """Test the create method from SIP API.""" # we setup a file storage tmppath = tempfile.mkdtemp() db.session.add(Location(name='default', uri=tmppath, default=True)) db.session.commit() # we create a file content = b'test lol\n' bucket = Bucket.create() obj = ObjectVersion.create(bucket, 'test.txt', stream=BytesIO(content)) db.session.commit() files = [obj] # setup metadata mjson = SIPMetadataType(title='JSON Test', name='json-test', format='json', schema='url') marcxml = SIPMetadataType(title='MARC XML Test', name='marcxml-test', format='xml', schema='uri') db.session.add(mjson) db.session.add(marcxml) metadata = { 'json-test': json.dumps({ 'this': 'is', 'not': 'sparta' }), 'marcxml-test': '<record></record>' } # Let's create a SIP user = create_test_user('*****@*****.**') agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'} sip = SIP.create(True, files=files, metadata=metadata, user_id=user.id, agent=agent) db.session.commit() assert SIP_.query.count() == 1 assert len(sip.files) == 1 assert len(sip.metadata) == 2 assert SIPFile.query.count() == 1 assert SIPMetadata.query.count() == 2 assert sip.user.id == user.id assert sip.agent == agent # we mock the user and the agent to test if the creation works app.config['SIPSTORE_AGENT_JSONSCHEMA_ENABLED'] = False mock_current_user = mocker.patch('invenio_sipstore.api.current_user') type(mock_current_user).is_anonymous = mocker.PropertyMock( return_value=True) sip = SIP.create(True, files=files, metadata=metadata) assert sip.model.user_id is None assert sip.user is None assert sip.agent == {} # finalization rmtree(tmppath)
def oais_start_transfer(uuid, accession_id='', archivematica_id=None): """Archive a sip. This function should be called to start a transfer to archive a sip. Once the transfer is finished, you should call :py:func:`invenio_archivematica.tasks.oais_finish_transfer`. The signal :py:data:`invenio_archivematica.signals.oais_transfer_started` is called with the sip as function parameter. :param str uuid: the UUID of the sip to archive :param str accession_id: the AIP accession ID. You can generate one from :py:func:`invenio_archivematica.factories.create_accession_id` """ # we get the sip sip = SIP.get_sip(uuid) # we register the sip as being processed ark = Archive.get_from_sip(uuid) if not ark: ark = Archive.create(sip.model) ark.accession_id = accession_id ark.status = ArchiveStatus.WAITING # we start the transfer imp = current_app.config['ARCHIVEMATICA_TRANSFER_FACTORY'] transfer = import_string(imp) ret = transfer(sip.id, current_app.config['ARCHIVEMATICA_TRANSFER_FOLDER']) if ret == 0: db.session.commit() oais_transfer_started.send(sip) return oais_fail_transfer(uuid, accession_id)
def test_Archive(db):
    """Test the Archive model class.

    Checks automatic Archive creation via signals on SIP creation,
    attribute updates, and the ``get_from_*`` lookup helpers.
    """
    assert Archive.query.count() == 0
    # we create an SIP, it will automatically create an Archive via signals
    user = create_test_user('*****@*****.**')
    sip = SIP.create(True, user_id=user.id, agent={'test': 'test'})
    db.session.commit()
    assert Archive.query.count() == 1
    ark = Archive.get_from_sip(sip.id)
    assert ark.sip.user.id == sip.user.id
    assert ark.status == ArchiveStatus.NEW
    assert ark.accession_id is None
    assert ark.archivematica_id is None
    # let's change the object
    ark.status = ArchiveStatus.REGISTERED
    ark.accession_id = '08'
    ark.archivematica_id = sip.id
    db.session.commit()
    ark = Archive.get_from_accession_id('08')
    # update must not have created a second row
    assert Archive.query.count() == 1
    assert ark.status == ArchiveStatus.REGISTERED
    assert ark.archivematica_id == sip.id
    # we try to get a non existing record
    assert Archive.get_from_sip(uuid.uuid4()) is None
def transfer_demo(uuid, config):
    """Transfer the files contained in the sip to the destination.

    Very similar to the rsync transfer. However, because of time, I use the
    VERY UNSECURE sshpass package for rsync authentication. DO NOT USE IN
    PROD!!!

    :param str uuid: the id of the sip containing files to transfer
    :param dict config: here config must be a dict with the following keys:

        - user - the SSH user
        - password_file - a path where the password is stored
        - remote - the URL or IP of the remote
        - remote_path - where to store files on the remote
        - args - the args for rsync
    :returns: the return code of the ``rsync`` process.
    :rtype: int
    """
    # we retrieve the archive and the SIP associated
    sip = SIP.get_sip(uuid)
    ark = Archive.get_from_sip(uuid)
    # we export it to the temp folder
    archiver = BaseArchiver(sip)
    archiver.write_all_files()
    # we rsync it to the remote
    src_path = archiver.get_fullpath('')
    dest_path = join(config['remote_path'], ark.accession_id)
    dest_path = '{}:{}'.format(config['remote'], dest_path)
    # BUGFIX: the format string was missing the {filename} placeholder, so
    # the password-file path passed as `filename=` was silently dropped and
    # sshpass received a literal, nonexistent path.
    ssh_command = 'sshpass -f {filename} ssh -l {user}'.format(
        filename=config['password_file'], user=config['user'])
    return call([
        'rsync', config['args'], '--rsh={}'.format(ssh_command),
        src_path, dest_path
    ])
def test_SIP_create(app, db, mocker): """Test the create method from SIP API.""" # we setup a file storage tmppath = tempfile.mkdtemp() db.session.add(Location(name='default', uri=tmppath, default=True)) db.session.commit() # we create a file content = b'test lol\n' bucket = Bucket.create() obj = ObjectVersion.create(bucket, 'test.txt', stream=BytesIO(content)) db.session.commit() files = [obj] # setup metadata mjson = SIPMetadataType(title='JSON Test', name='json-test', format='json', schema='url') marcxml = SIPMetadataType(title='MARC XML Test', name='marcxml-test', format='xml', schema='uri') db.session.add(mjson) db.session.add(marcxml) metadata = { 'json-test': json.dumps({'this': 'is', 'not': 'sparta'}), 'marcxml-test': '<record></record>' } # Let's create a SIP user = create_test_user('*****@*****.**') agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'} sip = SIP.create(True, files=files, metadata=metadata, user_id=user.id, agent=agent) db.session.commit() assert SIP_.query.count() == 1 assert len(sip.files) == 1 assert len(sip.metadata) == 2 assert SIPFile.query.count() == 1 assert SIPMetadata.query.count() == 2 assert sip.user.id == user.id assert sip.agent == agent # we mock the user and the agent to test if the creation works app.config['SIPSTORE_AGENT_JSONSCHEMA_ENABLED'] = False mock_current_user = mocker.patch('invenio_sipstore.api.current_user') type(mock_current_user).is_anonymous = mocker.PropertyMock( return_value=True) sip = SIP.create(True, files=files, metadata=metadata) assert sip.model.user_id is None assert sip.user is None assert sip.agent == {} # finalization rmtree(tmppath)
def test_SIP_build_agent_info(app, mocker): """Test SIP._build_agent_info static method.""" # with no information, we get an empty dict agent = SIP._build_agent_info() assert agent == {} # we mock flask function to give more info mocker.patch('invenio_sipstore.api.has_request_context', return_value=True, autospec=True) mock_request = mocker.patch('invenio_sipstore.api.request') type(mock_request).remote_addr = mocker.PropertyMock( return_value="localhost") mock_current_user = mocker.patch('invenio_sipstore.api.current_user') type(mock_current_user).is_authenticated = mocker.PropertyMock( return_value=True) type(mock_current_user).email = mocker.PropertyMock( return_value='*****@*****.**') agent = SIP._build_agent_info() assert agent == {'ip_address': 'localhost', 'email': '*****@*****.**'}
def test_SIP_metadata(db): """Test the metadata methods of API SIP.""" # we create a SIP model sip = SIP_.create() mtype = SIPMetadataType(title='JSON Test', name='json-test', format='json', schema='url') db.session.add(mtype) db.session.commit() # We create an API SIP on top of it api_sip = SIP(sip) assert len(api_sip.metadata) == 0 # we create a dummy metadata metadata = json.dumps({'this': 'is', 'not': 'sparta'}) # we attach it to the SIP sm = api_sip.attach_metadata('json-test', metadata) db.session.commit() assert len(api_sip.metadata) == 1 assert api_sip.metadata[0].type.format == 'json' assert api_sip.metadata[0].content == metadata assert sip.sip_metadata[0].content == metadata
def test_SIP_build_agent_info(app, mocker): """Test SIP._build_agent_info static method.""" # with no information, we get an empty dict agent = SIP._build_agent_info() assert agent == {} # we mock flask function to give more info mocker.patch('invenio_sipstore.api.has_request_context', return_value=True, autospec=True) mock_request = mocker.patch('invenio_sipstore.api.request') type(mock_request).remote_addr = mocker.PropertyMock( return_value="localhost") mock_current_user = mocker.patch('invenio_sipstore.api.current_user') type(mock_current_user).is_authenticated = mocker.PropertyMock( return_value=True) type(mock_current_user).email = mocker.PropertyMock( return_value='*****@*****.**') agent = SIP._build_agent_info() assert agent == { 'ip_address': 'localhost', 'email': '*****@*****.**' }
def test_SIP(db):
    """Test SIP API class.

    Verifies the API wrapper exposes the model's attributes, that the
    ``archived`` setter writes through to the model, and that ``get_sip``
    reloads the same SIP by id.
    """
    user = create_test_user('*****@*****.**')
    agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
    # we create a SIP model
    sip = SIP_.create(user_id=user.id, agent=agent)
    db.session.commit()
    # We create an API SIP on top of it
    api_sip = SIP(sip)
    assert api_sip.model is sip
    assert api_sip.id == sip.id
    assert api_sip.user is user
    assert api_sip.agent == agent
    assert api_sip.archivable is True
    assert api_sip.archived is False
    api_sip.archived = True
    db.session.commit()
    # the setter must write through to the underlying model
    assert api_sip.archived is True
    assert sip.archived is True
    # test of the get method
    api_sip2 = SIP.get_sip(sip.id)
    assert api_sip2.id == api_sip.id
def test_listeners(conf, expected_status, app, db): """Test listener_sip_created and listener_record_updated functions.""" # first we change the is_archivable function app.config['ARCHIVEMATICA_ISARCHIVABLE_FACTORY'] = conf assert Archive.query.count() == 0 # let's create an SIP user = create_test_user('*****@*****.**') sip = SIP.create(True, user_id=user.id, agent={'test': 'test'}) db.session.commit() assert Archive.query.count() == 1 ark = Archive.get_from_sip(sip.id) assert ark.sip.user.id == sip.user.id assert ark.status == expected_status
def transfer_rsync(uuid, config):
    """Transfer the files contained in the sip to the destination.

    The transfer is done with a rsync. If transfer to remote, you need a
    valid ssh setup.

    This method is automatically called by the module to transfer the files.
    Depending on your installation, you may want to have a different
    behavior (copy among servers...). Then, you can create your own factory
    and link it into the config variable
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FACTORY`.

    The config needs to include at least the destination folder. If transfer
    to remote, it needs to include the user and the server. In either cases,
    you can include usual rsync parameters. See
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FOLDER`:

    .. code-block:: python

        ARCHIVEMATICA_TRANSFER_FOLDER = {
            'server': 'localhost',
            'user': '******',
            'destination': '/tmp',
            'args': '-az'
        }

    :param str uuid: the id of the sip containing files to transfer
    :param config: the config for rsync
    """
    sip = SIP.get_sip(uuid)
    # export everything into a temporary staging folder first
    archiver = BaseArchiver(sip)
    archiver.write_all_files()
    # compute the rsync source and destination
    staged = archiver.get_fullpath('')
    target = config['destination']
    remote_user = config.get('user', None)
    remote_server = config.get('server', None)
    if remote_server and remote_user:
        # remote transfer: prefix with user@server:
        target = '{user}@{server}:{dest}'.format(user=remote_user,
                                                 server=remote_server,
                                                 dest=target)
    try:
        # hand off to rsync and report its exit status
        return call(['rsync', config['args'], staged, target])
    finally:
        # always clean up the staging folder
        rmtree(staged)
def oais_fail_transfer(uuid, accession_id='', archivematica_id=None):
    """Mark the transfer as failed.

    This function should be called if the transfer failed. See
    :py:func:`invenio_archivematica.tasks.oais_start_transfer`.

    The signal :py:data:`invenio_archivematica.signals.oais_transfer_failed`
    is called with the sip as function parameter.

    :param str uuid: the UUID of the sip
    """
    archive = Archive.get_from_sip(uuid)
    archive.status = ArchiveStatus.FAILED
    # the SIP is no longer considered archived
    archive.sip.archived = False
    db.session.commit()
    # notify listeners once the state change is persisted
    oais_transfer_failed.send(SIP(archive.sip))
def oais_process_aip(uuid, accession_id='', archivematica_id=None):
    """Mark the aip in progress.

    This function should be called if the aip is processing. See
    :py:func:`invenio_archivematica.tasks.oais_start_transfer`.

    The signal
    :py:data:`invenio_archivematica.signals.oais_transfer_processing`
    is called with the sip as function parameter.

    :param str uuid: the UUID of the sip
    :param str archivematica_id: the ID of the AIP in Archivematica
    """
    archive = Archive.get_from_sip(uuid)
    archive.status = ArchiveStatus.PROCESSING_AIP
    # remember which Archivematica AIP is processing this SIP
    archive.archivematica_id = archivematica_id
    db.session.commit()
    # notify listeners once the state change is persisted
    oais_transfer_processing.send(SIP(archive.sip))
def oais_finish_transfer(uuid, accession_id='', archivematica_id=None):
    """Finalize the transfer of a sip.

    This function should be called once the transfer has been finished, to
    mark the sip as correctly archived. See
    :py:func:`invenio_archivematica.tasks.oais_start_transfer`.

    The signal :py:data:`invenio_archivematica.signals.oais_transfer_finished`
    is called with the sip as function parameter.

    :param str uuid: the UUID of the sip
    :param str archivematica_id: the ID in Archivematica of the created AIP
        (should be an UUID)
    """
    archive = Archive.get_from_sip(uuid)
    archive.status = ArchiveStatus.REGISTERED
    # record the resulting AIP id and flag the SIP as archived
    archive.archivematica_id = archivematica_id
    archive.sip.archived = True
    db.session.commit()
    # notify listeners once the state change is persisted
    oais_transfer_finished.send(SIP(archive.sip))
def test_RecordSIP(db):
    """Test RecordSIP API class.

    Creates a record + PID, links them to a SIP via the RecordSIP model and
    checks the API wrapper exposes both.
    """
    user = create_test_user('*****@*****.**')
    agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
    # we create a record
    recid = uuid.uuid4()
    pid = PersistentIdentifier.create('recid', '1337', object_type='rec',
                                      object_uuid=recid,
                                      status=PIDStatus.REGISTERED)
    title = {'title': 'record test'}
    record = Record.create(title, recid)
    # we create the models
    sip = SIP.create(True, user_id=user.id, agent=agent)
    recordsip = RecordSIP_(sip_id=sip.id, pid_id=pid.id)
    db.session.commit()
    # We create an API SIP on top of it
    api_recordsip = RecordSIP(recordsip, sip)
    assert api_recordsip.model is recordsip
    assert api_recordsip.sip.id == sip.id
def transfer_cp(uuid, config):
    """Transfer the files contained in the sip to a local destination.

    The transfer is done with a simple copy of files.

    This method is automatically called by the module to transfer the files.
    Depending on your installation, you may want to have a different
    behavior (copy among servers...). Then, you can create your own factory
    and link it into the config variable
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FACTORY`.

    :param str uuid: the id of the sip containing files to transfer
    :param config: can be empty. It will have the content of the variable
        :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FOLDER`.
        However, it will use the export folder set in
        :py:data:`invenio_sipstore.config.SIPSTORE_ARCHIVER_LOCATION_NAME`
    """
    # write the SIP's files through the base archiver; the destination is
    # taken from the sipstore archiver location, not from `config`
    archiver = BaseArchiver(SIP.get_sip(uuid))
    archiver.write_all_files()
    return 0
def test_RecordSIP(db):
    """Test RecordSIP API class.

    Creates a record + PID, links them to a SIP via the RecordSIP model and
    checks the API wrapper exposes both.
    """
    user = create_test_user('*****@*****.**')
    agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
    # we create a record
    recid = uuid.uuid4()
    pid = PersistentIdentifier.create(
        'recid', '1337', object_type='rec', object_uuid=recid,
        status=PIDStatus.REGISTERED)
    title = {'title': 'record test'}
    record = Record.create(title, recid)
    # we create the models
    sip = SIP.create(True, user_id=user.id, agent=agent)
    recordsip = RecordSIP_(sip_id=sip.id, pid_id=pid.id)
    db.session.commit()
    # We create an API SIP on top of it
    api_recordsip = RecordSIP(recordsip, sip)
    assert api_recordsip.model is recordsip
    assert api_recordsip.sip.id == sip.id
def test_archiving(app, db, deposit, deposit_file, locations, archive_fs):
    """Test ZenodoSIP archiving.

    Publishes three SIPs (initial, metadata-only edit, new version with a
    removed and an added file) and verifies the BagIt manifests, fetch.txt
    contents and the files written to the archive filesystem.
    """
    # Stash the configuration and enable writing
    orig = app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED']
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = True
    deposit.files['test2.txt'] = BytesIO(b'test-two')
    deposit_v1 = publish_and_expunge(db, deposit)
    recid_v1, record_v1 = deposit_v1.fetch_published()
    recid_v1_id = recid_v1.id
    # Record files after publishing: 'test.txt', 'test2.txt'
    sip1 = SIP(SIPModel.query.one())
    sip1_id = sip1.id
    # Edit the metadata
    deposit_v1 = deposit_v1.edit()
    deposit_v1['title'] = "New title"
    deposit_v1 = publish_and_expunge(db, deposit_v1)
    # Record files after publishing: 'test.txt', 'test2.txt'
    sip2_id = SIPModel.query.order_by(SIPModel.created.desc()).first().id
    # Create a new version
    deposit_v1.newversion()
    recid_v1 = PersistentIdentifier.query.get(recid_v1_id)
    pv = PIDVersioning(child=recid_v1)
    depid_v2 = pv.draft_child_deposit
    deposit_v2 = ZenodoDeposit.get_record(depid_v2.object_uuid)
    del deposit_v2.files['test.txt']
    deposit_v2.files['test3.txt'] = BytesIO(b('test-three'))
    deposit_v2 = publish_and_expunge(db, deposit_v2)
    # Record files after publishing: 'test2.txt', 'test3.txt'
    sip1 = SIP(SIPModel.query.get(sip1_id))
    sip2 = SIP(SIPModel.query.get(sip2_id))
    sip3 = SIP(SIPModel.query.order_by(SIPModel.created.desc()).first())
    # Because we are using secure_filename when writing SIPFiles we need to
    # generate the correct names: <SIPFile.id>-<secure_filename>
    s1_file1_fn = '{0}-test.txt'.format(fetch_suff(sip1, 'test.txt').file_id)
    s1_file1_fp = 'data/files/{0}'.format(s1_file1_fn)
    s1_file2_fn = '{0}-test2.txt'.format(
        fetch_suff(sip1, 'test2.txt').file_id)
    s1_file2_fp = 'data/files/{0}'.format(s1_file2_fn)
    s3_file2_fn = '{0}-test2.txt'.format(
        fetch_suff(sip3, 'test2.txt').file_id)
    s3_file2_fp = 'data/files/{0}'.format(s3_file2_fn)
    s3_file3_fn = '{0}-test3.txt'.format(
        fetch_suff(sip3, 'test3.txt').file_id)
    s3_file3_fp = 'data/files/{0}'.format(s3_file3_fn)
    sip1_bagmeta = json.loads(next(
        m.content for m in sip1.metadata
        if m.type.name == 'bagit'))['files']
    sip2_bagmeta = json.loads(next(
        m.content for m in sip2.metadata
        if m.type.name == 'bagit'))['files']
    sip3_bagmeta = json.loads(next(
        m.content for m in sip3.metadata
        if m.type.name == 'bagit'))['files']
    # Check if Bagit metadata contains the correct file-fetching information
    assert set([f['filepath'] for f in sip1_bagmeta]) == \
        set([s1_file1_fp, s1_file2_fp, 'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt'])
    assert not BagItArchiver._is_fetched(
        get_m_item(sip1_bagmeta, s1_file1_fp))
    assert not BagItArchiver._is_fetched(
        get_m_item(sip1_bagmeta, s1_file2_fp))
    assert set([f['filepath'] for f in sip2_bagmeta]) == \
        set([s1_file1_fp, s1_file2_fp, 'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])
    # Both files should be fetched since it's only metadata-edit submission
    assert BagItArchiver._is_fetched(get_m_item(sip2_bagmeta, s1_file1_fp))
    assert BagItArchiver._is_fetched(get_m_item(sip2_bagmeta, s1_file2_fp))
    assert set([f['filepath'] for f in sip3_bagmeta]) == \
        set([s3_file2_fp, s3_file3_fp, 'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])
    # First file should be fetched from previous version and new file should
    # be archived in this bag.
    assert BagItArchiver._is_fetched(get_m_item(sip3_bagmeta, s3_file2_fp))
    assert not BagItArchiver._is_fetched(
        get_m_item(sip3_bagmeta, s3_file3_fp))
    archiver1 = BagItArchiver(sip1)
    archiver2 = BagItArchiver(sip2)
    archiver3 = BagItArchiver(sip3)
    # Each archiver subpath follows: '<recid>/r/<ISO-8601-SIP-timestamp>'
    sip1_ts = arrow.get(sip1.model.created).isoformat()
    sip2_ts = arrow.get(sip2.model.created).isoformat()
    sip3_ts = arrow.get(sip3.model.created).isoformat()
    assert archiver1.get_archive_subpath() == '2/r/{0}'.format(sip1_ts)
    assert archiver2.get_archive_subpath() == '2/r/{0}'.format(sip2_ts)
    assert archiver3.get_archive_subpath() == '3/r/{0}'.format(sip3_ts)
    # As a test, write the SIPs in reverse chronological order
    assert not sip1.archived
    assert not sip2.archived
    assert not sip3.archived
    archive_sip.delay(sip3.id)
    archive_sip.delay(sip2.id)
    archive_sip.delay(sip1.id)
    assert sip1.archived
    assert sip2.archived
    assert sip3.archived
    fs1 = archive_fs.opendir(archiver1.get_archive_subpath())
    assert set(fs1.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
        'bag-info.txt', 'data'
    ])
    assert set(fs1.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs1.listdir('data/metadata') == [
        'record-json.json',
    ]
    assert set(fs1.listdir('data/files')) == set([s1_file1_fn, s1_file2_fn])
    fs2 = archive_fs.opendir(archiver2.get_archive_subpath())
    assert set(fs2.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
        'bag-info.txt', 'data', 'fetch.txt'
    ])
    # Second SIP has written only the metadata,
    # because of that there should be no 'files/', but 'filenames.txt'
    # should still be there because of the fetch.txt
    assert set(fs2.listdir('data')) == set(['metadata', 'filenames.txt'])
    assert fs2.listdir('data/metadata') == [
        'record-json.json',
    ]
    with fs2.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # Fetched files should correctly fetch the files from the first archive
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 4 {fn}'.format(fn=s1_file1_fp,
                                               base=base_uri, s1ts=sip1_ts),
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s1_file2_fp,
                                               base=base_uri, s1ts=sip1_ts),
    ])
    fs3 = archive_fs.opendir(archiver3.get_archive_subpath())
    assert set(fs3.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
        'bag-info.txt', 'data', 'fetch.txt'
    ])
    # Third SIP should write only the extra 'test3.txt' file
    assert set(fs3.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs3.listdir('data/metadata') == [
        'record-json.json',
    ]
    assert fs3.listdir('data/files') == [
        s3_file3_fn,
    ]
    with fs3.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # Since 'file.txt' was removed in third SIP, we should only fetch the
    # 'test2.txt', also from the first archive, since that's where this
    # file resides physically.
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s3_file2_fp,
                                               base=base_uri, s1ts=sip1_ts),
    ])
    # restore the stashed configuration
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = orig
def __init__(self, sip, data_dir='data/files', metadata_dir='data/metadata',
             extra_dir='', patch_of=None, include_all_previous=False,
             tags=None, filenames_mapping_file='data/filenames.txt'):
    """Constructor of the BagIt Archiver.

    When specifying 'patch_of' parameter the 'include_all_previous' flag
    determines whether files that are missing in the archived SIP
    (w.r.t. the SIP specified in 'patch_of') should be treated as
    explicitly deleted (include_all_previous=False) or if they should
    still be included in the manifest.

    Example:

        include_all_previous = True

        SIP_1: SIPFiles: a.txt, b.txt
            BagIt Manifest: a.txt, b.txt

        SIP_2 (Bagged with patch_of=SIP_1): SIPFiles: b.txt, c.txt
            BagIt Manifest: a.txt, b.txt, c.txt
            fetch.txt: a.txt, b.txt

        include_all_previous = False

        SIP_1: SIPFiles: a.txt, b.txt
            BagIt Manifest: a.txt, b.txt

        SIP_2 (Bagged with patch_of=SIP_1): SIPFIles: b.txt, c.txt
            BagIt Manifest: b.txt, c.txt
            fetch.txt: b.txt

    :param sip: API instance of the SIP that is to be archived.
    :type sip: :py:class:`invenio_sipstore.api.SIP` or
        :py:class:`invenio_sipstore.models.SIP`
    :param data_dir: directory where the SIPFiles will be written.
    :param metadata_dir: directory where the SIPMetadata will be written.
    :param extra_dir: directory where all extra files will be written,
        including the BagIt-specific files.
    :param patch_of: Write a 'lightweight' bag, which will archive only
        the new SIPFiles, and refer to the repeated ones in "fetch.txt"
        file. The provided argument is a SIP API, which will be taken as
        a base for determining the "diff" between two bags.
    :type patch_of: :py:class:`invenio_sipstore.api.SIP` or
        :py:class:`invenio_sipstore.models.SIP`
    :param bool include_all_previous: If set to True and if 'patch_of' is
        used, include the files that are missing in the SIP w.r.t. to the
        'patch_of' SIP in the manifest. The opposite
        (include_all_previous=False) is equivalent to treating those as
        explicitly deleted - the files will not be included in the
        manifest, nor in the "fetch.txt" file.
    :param tags: a list of 2-tuple containing the tags of the bagit,
        which will be written to the 'bag-info.txt' file.
    :param filenames_mapping_file: filepath of the file in the archive
        which contains all of SIPFile mappings. If this parameter is
        boolean-resolvable as False, the file will not be created.
    """
    super(BagItArchiver, self).__init__(
        sip, data_dir=data_dir, metadata_dir=metadata_dir,
        extra_dir=extra_dir,
        filenames_mapping_file=filenames_mapping_file)
    # default tags come from the application configuration
    self.tags = tags or current_app.config['SIPSTORE_BAGIT_TAGS']
    # accept either a SIP API instance or a bare model for patch_of
    self.patch_of = (patch_of if isinstance(patch_of, SIP)
                     else SIP(patch_of)) if patch_of else None
    self.include_all_previous = include_all_previous
def sips(db, locations, sip_metadata_types):
    """Fixture for the SIP objects sharing multiple files.

    Four SIPs are sharing three files in the following way:
    SIP-1: File1
    SIP-2: File1, File2
    SIP-3: File2(renamed on SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6
    A fifth, metadata-only SIP (no files) is appended at the end.
    """
    # A SIP with agent info
    sip1 = SIP.create(agent={
        'email': '*****@*****.**',
        'orcid': '1111-1111-1111-1111',
        'ip_address': '1.1.1.1'
    })
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached should not be archived
    # (see conftest configuration)
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = FileInstance.create()
    file1.set_contents(BytesIO(b('test')),
                       default_location=locations['default'].uri)
    sip1file1 = SIPFile(sip_id=sip1.id, filepath="foobar.txt",
                        file_id=file1.id)
    db_.session.add(sip1file1)
    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = FileInstance.create()
    file2.set_contents(BytesIO(b'test-second'),
                       default_location=locations['default'].uri)
    # File1 is shared with SIP-1; File2 is new
    sip2file1 = SIPFile(sip_id=sip2.id, filepath="foobar.txt",
                        file_id=file1.id)
    sip2file2 = SIPFile(sip_id=sip2.id, filepath="foobar2.txt",
                        file_id=file2.id)
    db_.session.add(sip2file1)
    db_.session.add(sip2file2)
    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = FileInstance.create()
    file3.set_contents(BytesIO(b'test-third'),
                       default_location=locations['default'].uri)
    # File2 reused under a different SIPFile name; File3 is new
    sip3file2 = SIPFile(sip_id=sip3.id, filepath="foobar2-renamed.txt",
                        file_id=file2.id)
    sip3file3 = SIPFile(sip_id=sip3.id, filepath="foobar3.txt",
                        file_id=file3.id)
    db_.session.add(sip3file2)
    db_.session.add(sip3file3)
    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = FileInstance.create()
    file4.set_contents(BytesIO('test-fourth żółć'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file5 = FileInstance.create()
    file5.set_contents(BytesIO('test-fifth ąęćźə'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file6 = FileInstance.create()
    file6.set_contents(BytesIO('test-sixth π'.encode('utf-8')),
                       default_location=locations['default'].uri)
    # path traversal, URL and non-ASCII filepaths on purpose
    sip5file4 = SIPFile(sip_id=sip4.id, filepath="../../foobar.txt",
                        file_id=file4.id)
    sip5file5 = SIPFile(sip_id=sip4.id,
                        filepath="http://maliciouswebsite.com/hack.js",
                        file_id=file5.id)
    sip5file6 = SIPFile(sip_id=sip4.id, filepath="łóżźćąę.dat",
                        file_id=file6.id)
    db_.session.add(sip5file4)
    db_.session.add(sip5file5)
    db_.session.add(sip5file6)
    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')
    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]
def sips(db, locations, sip_metadata_types):
    """Fixture for the SIP objects sharing multiple files.

    Four SIPs are sharing three files in the following way:
    SIP-1: File1
    SIP-2: File1, File2
    SIP-3: File2(renamed on SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6
    A fifth, metadata-only SIP (no files) is appended at the end.
    """
    # A SIP with agent info
    sip1 = SIP.create(
        agent={
            'email': '*****@*****.**',
            'orcid': '1111-1111-1111-1111',
            'ip_address': '1.1.1.1'
        })
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached should not be archived
    # (see conftest configuration)
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = FileInstance.create()
    file1.set_contents(BytesIO(b('test')),
                       default_location=locations['default'].uri)
    sip1file1 = SIPFile(sip_id=sip1.id, filepath="foobar.txt",
                        file_id=file1.id)
    db_.session.add(sip1file1)
    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = FileInstance.create()
    file2.set_contents(BytesIO(b'test-second'),
                       default_location=locations['default'].uri)
    # File1 is shared with SIP-1; File2 is new
    sip2file1 = SIPFile(sip_id=sip2.id, filepath="foobar.txt",
                        file_id=file1.id)
    sip2file2 = SIPFile(sip_id=sip2.id, filepath="foobar2.txt",
                        file_id=file2.id)
    db_.session.add(sip2file1)
    db_.session.add(sip2file2)
    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = FileInstance.create()
    file3.set_contents(BytesIO(b'test-third'),
                       default_location=locations['default'].uri)
    # File2 reused under a different SIPFile name; File3 is new
    sip3file2 = SIPFile(sip_id=sip3.id, filepath="foobar2-renamed.txt",
                        file_id=file2.id)
    sip3file3 = SIPFile(sip_id=sip3.id, filepath="foobar3.txt",
                        file_id=file3.id)
    db_.session.add(sip3file2)
    db_.session.add(sip3file3)
    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = FileInstance.create()
    file4.set_contents(BytesIO('test-fourth żółć'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file5 = FileInstance.create()
    file5.set_contents(BytesIO('test-fifth ąęćźə'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file6 = FileInstance.create()
    file6.set_contents(BytesIO('test-sixth π'.encode('utf-8')),
                       default_location=locations['default'].uri)
    # path traversal, URL and non-ASCII filepaths on purpose
    sip5file4 = SIPFile(sip_id=sip4.id, filepath="../../foobar.txt",
                        file_id=file4.id)
    sip5file5 = SIPFile(sip_id=sip4.id,
                        filepath="http://maliciouswebsite.com/hack.js",
                        file_id=file5.id)
    sip5file6 = SIPFile(sip_id=sip4.id, filepath="łóżźćąę.dat",
                        file_id=file6.id)
    db_.session.add(sip5file4)
    db_.session.add(sip5file5)
    db_.session.add(sip5file6)
    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')
    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]