def test_SIP_files(db):
    """Test the files methods of API SIP.

    A stored object is attached to a fresh SIP and must then be visible both
    through the API wrapper (``api_sip.files``) and through the underlying
    model (``sip.sip_files``).
    """
    # we create a SIP model
    sip = SIP_.create()
    db.session.commit()
    # We create an API SIP on top of it
    api_sip = SIP(sip)
    assert len(api_sip.files) == 0
    # we setup a file storage
    tmppath = tempfile.mkdtemp()
    try:
        db.session.add(Location(name='default', uri=tmppath, default=True))
        db.session.commit()
        # we create a file
        content = b'test lol\n'
        bucket = Bucket.create()
        obj = ObjectVersion.create(bucket, 'test.txt',
                                   stream=BytesIO(content))
        db.session.commit()
        # we attach it to the SIP
        api_sip.attach_file(obj)
        db.session.commit()
        assert len(api_sip.files) == 1
        assert api_sip.files[0].filepath == 'test.txt'
        assert sip.sip_files[0].filepath == 'test.txt'
    finally:
        # finalization: remove the storage directory even when one of the
        # assertions above fails, so failed runs do not leak temp folders
        rmtree(tmppath)
Exemple #2
0
def test_SIP_files(db):
    """Test the files methods of API SIP.

    Checks that a SIP starts without files and that ``attach_file`` makes the
    attached object reachable from the API object and from the model.
    """
    # we create a SIP model
    sip = SIP_.create()
    db.session.commit()
    # We create an API SIP on top of it
    api_sip = SIP(sip)
    assert len(api_sip.files) == 0
    # we setup a file storage
    tmppath = tempfile.mkdtemp()
    try:
        db.session.add(Location(name='default', uri=tmppath, default=True))
        db.session.commit()
        # we create a file
        content = b'test lol\n'
        bucket = Bucket.create()
        obj = ObjectVersion.create(bucket, 'test.txt',
                                   stream=BytesIO(content))
        db.session.commit()
        # we attach it to the SIP (the returned object is not needed here)
        api_sip.attach_file(obj)
        db.session.commit()
        assert len(api_sip.files) == 1
        assert api_sip.files[0].filepath == 'test.txt'
        assert sip.sip_files[0].filepath == 'test.txt'
    finally:
        # finalization: the temporary storage is deleted whether or not the
        # test body succeeded
        rmtree(tmppath)
Exemple #3
0
def archive_sip(sip_uuid):
    """Write the BagIt archive of a SIP and flag the SIP as archived.

    Retries every 4 hours, six times, which should work for up to 24 hours
    archiving system downtime.

    An :class:`ArchivingError` (missing BagIt metadata, or SIP already
    archived) is never retried; it is propagated to the caller as is. Any
    other exception is handed to ``archive_sip.retry``.

    :param sip_uuid: UUID of the SIP for archiving.
    :type sip_uuid: str
    """
    try:
        api_sip = SIPApi(SIP.query.get(sip_uuid))
        bagit_archiver = BagItArchiver(api_sip)
        if bagit_archiver.get_bagit_metadata(api_sip) is None:
            raise ArchivingError(
                'Bagit metadata does not exist for SIP: {0}.'.format(
                    api_sip.id))
        if api_sip.archived:
            raise ArchivingError(
                'SIP was already archived {0}.'.format(api_sip.id))
        bagit_archiver.write_all_files()
        api_sip.archived = True
        db.session.commit()
    except ArchivingError:
        # Precondition failures are final: re-raise without retrying
        raise
    except Exception as exc:
        # Everything else is worth another attempt
        archive_sip.retry(exc=exc)
        raise
Exemple #4
0
def test_SIP_create(app, db, mocker):
    """Test the create method from SIP API.

    First a SIP is created with explicit files, metadata, user and agent;
    then a second one is created with a mocked anonymous ``current_user`` to
    check that no user and an empty agent are recorded.
    """
    # we setup a file storage
    tmppath = tempfile.mkdtemp()
    try:
        db.session.add(Location(name='default', uri=tmppath, default=True))
        db.session.commit()
        # we create a file
        content = b'test lol\n'
        bucket = Bucket.create()
        obj = ObjectVersion.create(bucket, 'test.txt',
                                   stream=BytesIO(content))
        db.session.commit()
        files = [obj]
        # setup metadata
        mjson = SIPMetadataType(title='JSON Test',
                                name='json-test',
                                format='json',
                                schema='url')
        marcxml = SIPMetadataType(title='MARC XML Test',
                                  name='marcxml-test',
                                  format='xml',
                                  schema='uri')
        db.session.add(mjson)
        db.session.add(marcxml)
        metadata = {
            'json-test': json.dumps({
                'this': 'is',
                'not': 'sparta'
            }),
            'marcxml-test': '<record></record>'
        }
        # Let's create a SIP
        user = create_test_user('*****@*****.**')
        agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
        sip = SIP.create(True,
                         files=files,
                         metadata=metadata,
                         user_id=user.id,
                         agent=agent)
        db.session.commit()
        assert SIP_.query.count() == 1
        assert len(sip.files) == 1
        assert len(sip.metadata) == 2
        assert SIPFile.query.count() == 1
        assert SIPMetadata.query.count() == 2
        assert sip.user.id == user.id
        assert sip.agent == agent
        # we mock the user and the agent to test if the creation works
        app.config['SIPSTORE_AGENT_JSONSCHEMA_ENABLED'] = False
        mock_current_user = mocker.patch('invenio_sipstore.api.current_user')
        type(mock_current_user).is_anonymous = mocker.PropertyMock(
            return_value=True)
        sip = SIP.create(True, files=files, metadata=metadata)
        assert sip.model.user_id is None
        assert sip.user is None
        assert sip.agent == {}
    finally:
        # finalization: always drop the temporary storage, even if an
        # assertion failed halfway through
        rmtree(tmppath)
def oais_start_transfer(uuid, accession_id='', archivematica_id=None):
    """Archive a sip.

    Call this function to start the transfer that archives a sip. When the
    transfer is over, :py:func:`invenio_archivematica.tasks.oais_finish_transfer`
    should be called.

    On success the signal
    :py:data:`invenio_archivematica.signals.oais_transfer_started` is sent
    with the sip as sender. If the transfer factory returns anything other
    than ``0``, the transfer is marked as failed through
    :py:func:`oais_fail_transfer`.

    :param str uuid: the UUID of the sip to archive
    :param str accession_id: the AIP accession ID. You can generate one from
    :py:func:`invenio_archivematica.factories.create_accession_id`
    :param archivematica_id: not used by this function.
    """
    sip = SIP.get_sip(uuid)
    # Register the sip as waiting, creating its Archive row when missing
    archive = Archive.get_from_sip(uuid)
    if not archive:
        archive = Archive.create(sip.model)
    archive.accession_id = accession_id
    archive.status = ArchiveStatus.WAITING
    # Resolve the configured transfer factory and run it
    transfer_factory = import_string(
        current_app.config['ARCHIVEMATICA_TRANSFER_FACTORY'])
    ret = transfer_factory(
        sip.id, current_app.config['ARCHIVEMATICA_TRANSFER_FOLDER'])
    if ret != 0:
        oais_fail_transfer(uuid, accession_id)
        return
    db.session.commit()
    oais_transfer_started.send(sip)
Exemple #6
0
def test_Archive(db):
    """Test the Archive model class."""
    assert Archive.query.count() == 0
    # Creating a SIP is expected to create the matching Archive via signals
    owner = create_test_user('*****@*****.**')
    sip = SIP.create(True, user_id=owner.id, agent={'test': 'test'})
    db.session.commit()

    assert Archive.query.count() == 1
    archive = Archive.get_from_sip(sip.id)
    assert archive.sip.user.id == sip.user.id
    assert archive.status == ArchiveStatus.NEW
    assert archive.accession_id is None
    assert archive.archivematica_id is None
    # Update the archive, then fetch it again through its accession id
    archive.status = ArchiveStatus.REGISTERED
    archive.accession_id = '08'
    archive.archivematica_id = sip.id
    db.session.commit()
    archive = Archive.get_from_accession_id('08')
    assert Archive.query.count() == 1
    assert archive.status == ArchiveStatus.REGISTERED
    assert archive.archivematica_id == sip.id
    # Looking up an unknown SIP id yields None
    assert Archive.get_from_sip(uuid.uuid4()) is None
Exemple #7
0
def transfer_demo(uuid, config):
    """Transfer the files contained in the sip to the destination.

    Works much like the rsync transfer, except that rsync authenticates
    through the VERY UNSECURE sshpass package (chosen for lack of time).
    DO NOT USE IN PROD!!!

    :param str uuid: the id of the sip containing files to transfer
    :param dict config: here config must be a dict with the following keys:
        - user - the SSH user
        - password_file - a path where the password is stored
        - remote - the URL or IP of the remote
        - remote_path - where to store files on the remote
        - args - the args for rsync
    :returns: the value returned by ``call`` for the rsync command.
    """
    # Load the SIP and its archive record
    sip = SIP.get_sip(uuid)
    archive = Archive.get_from_sip(uuid)

    # Export the SIP files locally
    exporter = BaseArchiver(sip)
    exporter.write_all_files()

    # Build the rsync command: <remote>:<remote_path>/<accession_id>
    source = exporter.get_fullpath('')
    remote_dir = join(config['remote_path'], archive.accession_id)
    destination = '{}:{}'.format(config['remote'], remote_dir)
    remote_shell = 'sshpass -f {filename} ssh -l {user}'.format(
        filename=config['password_file'], user=config['user'])
    rsync_command = [
        'rsync',
        config['args'],
        '--rsh={}'.format(remote_shell),
        source,
        destination,
    ]
    return call(rsync_command)
def test_SIP_create(app, db, mocker):
    """Test the create method from SIP API.

    Covers creation with an explicit user/agent and creation with a mocked
    anonymous ``current_user`` (no user, empty agent).
    """
    # we setup a file storage
    tmppath = tempfile.mkdtemp()
    try:
        db.session.add(Location(name='default', uri=tmppath, default=True))
        db.session.commit()
        # we create a file
        content = b'test lol\n'
        bucket = Bucket.create()
        obj = ObjectVersion.create(bucket, 'test.txt',
                                   stream=BytesIO(content))
        db.session.commit()
        files = [obj]
        # setup metadata
        mjson = SIPMetadataType(title='JSON Test', name='json-test',
                                format='json', schema='url')
        marcxml = SIPMetadataType(title='MARC XML Test', name='marcxml-test',
                                  format='xml', schema='uri')
        db.session.add(mjson)
        db.session.add(marcxml)
        metadata = {
            'json-test': json.dumps({'this': 'is', 'not': 'sparta'}),
            'marcxml-test': '<record></record>'
        }
        # Let's create a SIP
        user = create_test_user('*****@*****.**')
        agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
        sip = SIP.create(True, files=files, metadata=metadata,
                         user_id=user.id, agent=agent)
        db.session.commit()
        assert SIP_.query.count() == 1
        assert len(sip.files) == 1
        assert len(sip.metadata) == 2
        assert SIPFile.query.count() == 1
        assert SIPMetadata.query.count() == 2
        assert sip.user.id == user.id
        assert sip.agent == agent
        # we mock the user and the agent to test if the creation works
        app.config['SIPSTORE_AGENT_JSONSCHEMA_ENABLED'] = False
        mock_current_user = mocker.patch('invenio_sipstore.api.current_user')
        type(mock_current_user).is_anonymous = mocker.PropertyMock(
            return_value=True)
        sip = SIP.create(True, files=files, metadata=metadata)
        assert sip.model.user_id is None
        assert sip.user is None
        assert sip.agent == {}
    finally:
        # finalization: runs on success and on failure so that the temp
        # storage directory never outlives the test
        rmtree(tmppath)
Exemple #9
0
def test_SIP_build_agent_info(app, mocker):
    """Test SIP._build_agent_info static method."""
    # Nothing is patched yet: the agent info must be empty
    assert SIP._build_agent_info() == {}

    # Patch the flask helpers used by the API module so that a request
    # context, a remote address and an authenticated user are reported
    mocker.patch('invenio_sipstore.api.has_request_context',
                 return_value=True,
                 autospec=True)
    request_mock = mocker.patch('invenio_sipstore.api.request')
    type(request_mock).remote_addr = mocker.PropertyMock(
        return_value="localhost")
    user_mock = mocker.patch('invenio_sipstore.api.current_user')
    type(user_mock).is_authenticated = mocker.PropertyMock(
        return_value=True)
    type(user_mock).email = mocker.PropertyMock(
        return_value='*****@*****.**')

    expected = {'ip_address': 'localhost', 'email': '*****@*****.**'}
    assert SIP._build_agent_info() == expected
def test_SIP_metadata(db):
    """Test the metadata methods of API SIP."""
    # Persist a SIP model together with a JSON metadata type
    sip = SIP_.create()
    json_type = SIPMetadataType(
        title='JSON Test', name='json-test', format='json', schema='url')
    db.session.add(json_type)
    db.session.commit()
    # Wrap the model in the API class; no metadata is attached yet
    api_sip = SIP(sip)
    assert len(api_sip.metadata) == 0
    # Attach a dummy JSON document under the 'json-test' type
    payload = json.dumps({'this': 'is', 'not': 'sparta'})
    api_sip.attach_metadata('json-test', payload)
    db.session.commit()
    # The metadata is visible through the API object and through the model
    assert len(api_sip.metadata) == 1
    attached = api_sip.metadata[0]
    assert attached.type.format == 'json'
    assert api_sip.metadata[0].content == payload
    assert sip.sip_metadata[0].content == payload
def test_SIP_build_agent_info(app, mocker):
    """Test SIP._build_agent_info static method."""
    # Without any patched context the method returns an empty dict
    empty_agent = SIP._build_agent_info()
    assert empty_agent == {}

    # Make the API module believe it runs inside a request issued from
    # "localhost" by an authenticated user
    api = 'invenio_sipstore.api'
    mocker.patch(api + '.has_request_context',
                 return_value=True, autospec=True)
    fake_request = mocker.patch(api + '.request')
    type(fake_request).remote_addr = mocker.PropertyMock(
        return_value="localhost")
    fake_user = mocker.patch(api + '.current_user')
    type(fake_user).is_authenticated = mocker.PropertyMock(
        return_value=True)
    type(fake_user).email = mocker.PropertyMock(
        return_value='*****@*****.**')

    full_agent = SIP._build_agent_info()
    assert full_agent == {
        'ip_address': 'localhost',
        'email': '*****@*****.**'
    }
def test_SIP(db):
    """Test SIP API class."""
    agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
    user = create_test_user('*****@*****.**')
    # Persist a SIP model owned by the test user
    model = SIP_.create(user_id=user.id, agent=agent)
    db.session.commit()
    # The API object must expose the model and its attributes
    api_sip = SIP(model)
    assert api_sip.model is model
    assert api_sip.id == model.id
    assert api_sip.user is user
    assert api_sip.agent == agent
    assert api_sip.archivable is True
    assert api_sip.archived is False
    # Setting 'archived' on the API object is reflected on the model
    api_sip.archived = True
    db.session.commit()
    assert api_sip.archived is True
    assert model.archived is True
    # test of the get method
    fetched = SIP.get_sip(model.id)
    assert fetched.id == api_sip.id
Exemple #13
0
def test_SIP(db):
    """Test SIP API class."""
    test_user = create_test_user('*****@*****.**')
    agent_info = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
    # Create the SIP model that the API object will wrap
    sip_model = SIP_.create(user_id=test_user.id, agent=agent_info)
    db.session.commit()
    wrapper = SIP(sip_model)
    # Read-only accessors mirror the model
    assert wrapper.model is sip_model
    assert wrapper.id == sip_model.id
    assert wrapper.user is test_user
    assert wrapper.agent == agent_info
    assert wrapper.archivable is True
    assert wrapper.archived is False
    # The 'archived' flag can be written through the wrapper
    wrapper.archived = True
    db.session.commit()
    assert wrapper.archived is True
    assert sip_model.archived is True
    # SIP.get_sip returns an API object for the same SIP
    same_sip = SIP.get_sip(sip_model.id)
    assert same_sip.id == wrapper.id
Exemple #14
0
def test_SIP_metadata(db):
    """Test the metadata methods of API SIP."""
    # Create the SIP model and register the metadata type used below
    sip_model = SIP_.create()
    db.session.add(SIPMetadataType(title='JSON Test',
                                   name='json-test',
                                   format='json',
                                   schema='url'))
    db.session.commit()
    # A freshly wrapped SIP has no metadata
    api_sip = SIP(sip_model)
    assert len(api_sip.metadata) == 0
    # Attach a dummy JSON document to the SIP
    document = json.dumps({'this': 'is', 'not': 'sparta'})
    api_sip.attach_metadata('json-test', document)
    db.session.commit()
    # Exactly one metadata entry exists, with the expected type and content
    assert len(api_sip.metadata) == 1
    assert api_sip.metadata[0].type.format == 'json'
    assert api_sip.metadata[0].content == document
    assert sip_model.sip_metadata[0].content == document
Exemple #15
0
def test_listeners(conf, expected_status, app, db):
    """Test listener_sip_created and listener_record_updated functions."""
    # Plug the is_archivable factory under test into the configuration
    app.config['ARCHIVEMATICA_ISARCHIVABLE_FACTORY'] = conf

    assert Archive.query.count() == 0
    # Creating a SIP must result in exactly one Archive
    owner = create_test_user('*****@*****.**')
    sip = SIP.create(True, user_id=owner.id, agent={'test': 'test'})
    db.session.commit()

    assert Archive.query.count() == 1
    archive = Archive.get_from_sip(sip.id)
    assert archive.sip.user.id == sip.user.id
    # The resulting status depends on the configured factory
    assert archive.status == expected_status
Exemple #16
0
def transfer_rsync(uuid, config):
    """Transfer the files contained in the sip to the destination.

    The files are first exported to a temporary folder and then sent with
    rsync. A transfer to a remote host requires a valid ssh setup. The
    temporary folder is removed afterwards, whatever the outcome of rsync.

    This method is automatically called by the module to transfer the files.
    Depending on your installation, you may want to have a different behavior
    (copy among servers...). Then, you can create your own factory and link it
    into the config variable
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FACTORY`.

    The config needs to include at least the destination folder. For a
    remote transfer it must also include the user and the server. In either
    case the usual rsync parameters can be given. See
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FOLDER`:

    .. code-block:: python

        ARCHIVEMATICA_TRANSFER_FOLDER = {
            'server': 'localhost',
            'user': '******',
            'destination': '/tmp',
            'args': '-az'
        }

    :param str uuid: the id of the sip containing files to transfer
    :param config: the config for rsync
    :returns: the value returned by ``call`` for the rsync command.
    """
    # Export everything to a temp folder
    exporter = BaseArchiver(SIP.get_sip(uuid))
    exporter.write_all_files()
    src_path = exporter.get_fullpath('')

    # Prefix the destination with user@server only when both are configured
    dest_path = config['destination']
    if config.get('server') and config.get('user'):
        dest_path = '{user}@{server}:{dest}'.format(
            user=config['user'], server=config['server'], dest=dest_path)

    try:
        return call(['rsync', config['args'], src_path, dest_path])
    finally:
        # The temp folder is removed even if rsync could not be run
        rmtree(src_path)
def oais_fail_transfer(uuid, accession_id='', archivematica_id=None):
    """Mark the transfer as failed.

    To be called when a transfer did not succeed. See
    :py:func:`invenio_archivematica.tasks.oais_start_transfer`.

    After the change is committed, the signal
    :py:data:`invenio_archivematica.signals.oais_transfer_failed` is sent
    with the API SIP as sender.

    :param str uuid: the UUID of the sip
    :param accession_id: not used by this function.
    :param archivematica_id: not used by this function.
    """
    archive = Archive.get_from_sip(uuid)
    archive.status = ArchiveStatus.FAILED
    archive.sip.archived = False
    db.session.commit()

    failed_sip = SIP(archive.sip)
    oais_transfer_failed.send(failed_sip)
def oais_process_aip(uuid, accession_id='', archivematica_id=None):
    """Mark the aip in progress.

    To be called while the aip is being processed. See
    :py:func:`invenio_archivematica.tasks.oais_start_transfer`.

    After the change is committed, the signal
    :py:data:`invenio_archivematica.signals.oais_transfer_processing`
    is sent with the API SIP as sender.

    :param str uuid: the UUID of the sip
    :param accession_id: not used by this function.
    :param str archivematica_id: the ID of the AIP in Archivematica
    """
    archive = Archive.get_from_sip(uuid)
    archive.status = ArchiveStatus.PROCESSING_AIP
    archive.archivematica_id = archivematica_id
    db.session.commit()

    processing_sip = SIP(archive.sip)
    oais_transfer_processing.send(processing_sip)
def oais_finish_transfer(uuid, accession_id='', archivematica_id=None):
    """Finalize the transfer of a sip.

    To be called once the transfer is over, in order to record the sip as
    correctly archived. See
    :py:func:`invenio_archivematica.tasks.oais_start_transfer`.

    After the change is committed, the signal
    :py:data:`invenio_archivematica.signals.oais_transfer_finished` is sent
    with the API SIP as sender.

    :param str uuid: the UUID of the sip
    :param accession_id: not used by this function.
    :param str archivematica_id: the ID in Archivematica of the created AIP
        (should be an UUID)
    """
    archive = Archive.get_from_sip(uuid)
    archive.status = ArchiveStatus.REGISTERED
    archive.archivematica_id = archivematica_id
    archive.sip.archived = True
    db.session.commit()

    archived_sip = SIP(archive.sip)
    oais_transfer_finished.send(archived_sip)
Exemple #20
0
def test_RecordSIP(db):
    """Test RecordSIP API class."""
    agent = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
    user = create_test_user('*****@*****.**')
    # Register a persistent identifier and the record it points to
    record_uuid = uuid.uuid4()
    pid = PersistentIdentifier.create(
        'recid', '1337', object_type='rec', object_uuid=record_uuid,
        status=PIDStatus.REGISTERED)
    record = Record.create({'title': 'record test'}, record_uuid)
    # Create the SIP and the model linking it to the PID
    sip = SIP.create(True, user_id=user.id, agent=agent)
    link = RecordSIP_(sip_id=sip.id, pid_id=pid.id)
    db.session.commit()
    # The API object exposes both the link model and the SIP
    api_recordsip = RecordSIP(link, sip)
    assert api_recordsip.model is link
    assert api_recordsip.sip.id == sip.id
Exemple #21
0
def transfer_cp(uuid, config):
    """Transfer the files contained in the sip to a local destination.

    The transfer is a plain copy of the files, done by the archiver.

    This method is automatically called by the module to transfer the files.
    Depending on your installation, you may want to have a different behavior
    (copy among servers...). Then, you can create your own factory and link it
    into the config variable
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FACTORY`.

    :param str uuid: the id of the sip containing files to transfer
    :param config: can be empty. It will have the content of the variable
        :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FOLDER`.
        However, it will use the export folder set in
        :py:data:`invenio_sipstore.config.SIPSTORE_ARCHIVER_LOCATION_NAME`
    :returns: always ``0``.
    """
    # 'config' is not read here: the archiver decides where files are written
    BaseArchiver(SIP.get_sip(uuid)).write_all_files()
    return 0
def test_RecordSIP(db):
    """Test RecordSIP API class."""
    owner = create_test_user('*****@*****.**')
    agent_info = {'email': '*****@*****.**', 'ip_address': '1.1.1.1'}
    # Build a record together with its registered 'recid' identifier
    recid = uuid.uuid4()
    pid_kwargs = dict(object_type='rec', object_uuid=recid,
                      status=PIDStatus.REGISTERED)
    pid = PersistentIdentifier.create('recid', '1337', **pid_kwargs)
    record_data = {'title': 'record test'}
    record = Record.create(record_data, recid)
    # Build the SIP and the RecordSIP model that ties it to the PID
    sip = SIP.create(True, user_id=owner.id, agent=agent_info)
    recordsip_model = RecordSIP_(sip_id=sip.id, pid_id=pid.id)
    db.session.commit()
    # Wrap both in the API class and check the accessors
    api_recordsip = RecordSIP(recordsip_model, sip)
    assert api_recordsip.model is recordsip_model
    assert api_recordsip.sip.id == sip.id
Exemple #23
0
def test_archiving(app, db, deposit, deposit_file, locations, archive_fs):
    """Test ZenodoSIP archiving.

    The deposit is published three times (initial publication, metadata-only
    edit, new version with a different file set), producing three SIPs. The
    test checks the BagIt metadata stored on each SIP, runs ``archive_sip``
    for all of them and finally inspects what was written to ``archive_fs``,
    including the content of the ``fetch.txt`` files.
    """
    # Stash the configuration and enable writing
    orig = app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED']
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = True
    deposit.files['test2.txt'] = BytesIO(b'test-two')
    deposit_v1 = publish_and_expunge(db, deposit)
    recid_v1, record_v1 = deposit_v1.fetch_published()
    recid_v1_id = recid_v1.id
    # Record files after publishing: 'test.txt', 'test2.txt'

    # Only one SIP exists at this point; keep its id so that it can be
    # re-queried later on
    sip1 = SIP(SIPModel.query.one())
    sip1_id = sip1.id

    # Edit the metadata
    deposit_v1 = deposit_v1.edit()
    deposit_v1['title'] = "New title"
    deposit_v1 = publish_and_expunge(db, deposit_v1)
    # Record files after publishing: 'test.txt', 'test2.txt'
    # The most recently created SIP is the one of the metadata edit
    sip2_id = SIPModel.query.order_by(SIPModel.created.desc()).first().id

    # Create a new version
    deposit_v1.newversion()
    recid_v1 = PersistentIdentifier.query.get(recid_v1_id)
    pv = PIDVersioning(child=recid_v1)
    depid_v2 = pv.draft_child_deposit
    deposit_v2 = ZenodoDeposit.get_record(depid_v2.object_uuid)
    del deposit_v2.files['test.txt']
    deposit_v2.files['test3.txt'] = BytesIO(b('test-three'))
    deposit_v2 = publish_and_expunge(db, deposit_v2)
    # Record files after publishing: 'test2.txt', 'test3.txt'

    # Reload all three SIPs; the newest one belongs to the new version
    sip1 = SIP(SIPModel.query.get(sip1_id))
    sip2 = SIP(SIPModel.query.get(sip2_id))
    sip3 = SIP(SIPModel.query.order_by(SIPModel.created.desc()).first())

    # Because we are using secure_filename when writing SIPFiles we need to
    # generate the correct names: <file_id>-<secure_filename>
    s1_file1_fn = '{0}-test.txt'.format(fetch_suff(sip1, 'test.txt').file_id)
    s1_file1_fp = 'data/files/{0}'.format(s1_file1_fn)

    s1_file2_fn = '{0}-test2.txt'.format(fetch_suff(sip1, 'test2.txt').file_id)
    s1_file2_fp = 'data/files/{0}'.format(s1_file2_fn)

    s3_file2_fn = '{0}-test2.txt'.format(fetch_suff(sip3, 'test2.txt').file_id)
    s3_file2_fp = 'data/files/{0}'.format(s3_file2_fn)

    s3_file3_fn = '{0}-test3.txt'.format(fetch_suff(sip3, 'test3.txt').file_id)
    s3_file3_fp = 'data/files/{0}'.format(s3_file3_fn)

    # Extract the 'files' list from the 'bagit' metadata entry of each SIP
    sip1_bagmeta = json.loads(
        next(m.content for m in sip1.metadata
             if m.type.name == 'bagit'))['files']
    sip2_bagmeta = json.loads(
        next(m.content for m in sip2.metadata
             if m.type.name == 'bagit'))['files']
    sip3_bagmeta = json.loads(
        next(m.content for m in sip3.metadata
             if m.type.name == 'bagit'))['files']

    # Check if Bagit metadata contains the correct file-fetching information
    assert set([f['filepath'] for f in sip1_bagmeta]) == \
        set([s1_file1_fp,
             s1_file2_fp,
             'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt'])
    assert not BagItArchiver._is_fetched(get_m_item(sip1_bagmeta, s1_file1_fp))
    assert not BagItArchiver._is_fetched(get_m_item(sip1_bagmeta, s1_file2_fp))

    assert set([f['filepath'] for f in sip2_bagmeta]) == \
        set([s1_file1_fp,
             s1_file2_fp,
             'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])
    # Both files should be fetched since it's only metadata-edit submission
    assert BagItArchiver._is_fetched(get_m_item(sip2_bagmeta, s1_file1_fp))
    assert BagItArchiver._is_fetched(get_m_item(sip2_bagmeta, s1_file2_fp))

    assert set([f['filepath'] for f in sip3_bagmeta]) == \
        set([s3_file2_fp,
             s3_file3_fp,
             'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])

    # First file should be fetched from previous version and new file should
    # be archived in this bag.
    assert BagItArchiver._is_fetched(get_m_item(sip3_bagmeta, s3_file2_fp))
    assert not BagItArchiver._is_fetched(get_m_item(sip3_bagmeta, s3_file3_fp))
    archiver1 = BagItArchiver(sip1)
    archiver2 = BagItArchiver(sip2)
    archiver3 = BagItArchiver(sip3)

    # Each archiver subpath follows: '<recid>/r/<ISO-8601-SIP-timestamp>'
    sip1_ts = arrow.get(sip1.model.created).isoformat()
    sip2_ts = arrow.get(sip2.model.created).isoformat()
    sip3_ts = arrow.get(sip3.model.created).isoformat()
    assert archiver1.get_archive_subpath() == '2/r/{0}'.format(sip1_ts)
    assert archiver2.get_archive_subpath() == '2/r/{0}'.format(sip2_ts)
    assert archiver3.get_archive_subpath() == '3/r/{0}'.format(sip3_ts)

    # As a test, write the SIPs in reverse chronological order
    assert not sip1.archived
    assert not sip2.archived
    assert not sip3.archived
    # NOTE(review): the assertions right after .delay() presumably rely on
    # the tasks being executed eagerly in the test setup - confirm.
    archive_sip.delay(sip3.id)
    archive_sip.delay(sip2.id)
    archive_sip.delay(sip1.id)
    assert sip1.archived
    assert sip2.archived
    assert sip3.archived

    # First SIP: a complete bag, without 'fetch.txt'
    fs1 = archive_fs.opendir(archiver1.get_archive_subpath())
    assert set(fs1.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt', 'bag-info.txt',
        'data'
    ])
    assert set(fs1.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs1.listdir('data/metadata') == [
        'record-json.json',
    ]
    assert set(fs1.listdir('data/files')) == set([s1_file1_fn, s1_file2_fn])

    fs2 = archive_fs.opendir(archiver2.get_archive_subpath())
    assert set(fs2.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt', 'bag-info.txt',
        'data', 'fetch.txt'
    ])
    # Second SIP has written only the metadata,
    # because of that there should be no 'files/', but 'filenames.txt' should
    # still be there because of the fetch.txt
    assert set(fs2.listdir('data')) == set(['metadata', 'filenames.txt'])
    assert fs2.listdir('data/metadata') == [
        'record-json.json',
    ]

    with fs2.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # Fetched files should correctly fetch the files from the first archive
    # (each expected line is '<uri> <number> <filepath>'; the number is
    # presumably the file size in bytes, e.g. 8 for b'test-two' - confirm)
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 4 {fn}'.format(fn=s1_file1_fp,
                                               base=base_uri,
                                               s1ts=sip1_ts),
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s1_file2_fp,
                                               base=base_uri,
                                               s1ts=sip1_ts),
    ])

    fs3 = archive_fs.opendir(archiver3.get_archive_subpath())
    assert set(fs3.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt', 'bag-info.txt',
        'data', 'fetch.txt'
    ])
    # Third SIP should write only the extra 'test3.txt' file
    assert set(fs3.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs3.listdir('data/metadata') == [
        'record-json.json',
    ]
    assert fs3.listdir('data/files') == [
        s3_file3_fn,
    ]
    with fs3.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # Since 'test.txt' was removed in third SIP, we should only fetch the
    # 'test2.txt', also from the first archive, since that's where this
    # file resides physically.
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s3_file2_fp,
                                               base=base_uri,
                                               s1ts=sip1_ts),
    ])
    # Restore the configuration value stashed at the top of the test
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = orig
    def __init__(self, sip, data_dir='data/files',
                 metadata_dir='data/metadata', extra_dir='', patch_of=None,
                 include_all_previous=False, tags=None,
                 filenames_mapping_file='data/filenames.txt'):
        """Constructor of the BagIt Archiver.

        When specifying 'patch_of' parameter the 'include_all_previous'
        flag determines whether files that are missing in the archived SIP
        (w.r.t. the SIP specified in 'patch_of') should be treated as
        explicitly deleted (include_all_previous=False) or if they
        should still be included in the manifest.

        Example:
            include_all_previous = True
              SIP_1:
                SIPFiles: a.txt, b.txt
                BagIt Manifest: a.txt, b.txt
              SIP_2 (Bagged with patch_of=SIP_1):
                SIPFiles: b.txt, c.txt
                BagIt Manifest: a.txt, b.txt, c.txt
                fetch.txt: a.txt, b.txt

            include_all_previous = False
              SIP_1:
                SIPFiles: a.txt, b.txt
                BagIt Manifest: a.txt, b.txt
              SIP_2 (Bagged with patch_of=SIP_1):
                SIPFiles: b.txt, c.txt
                BagIt Manifest: b.txt, c.txt
                fetch.txt: b.txt

        :param sip: API instance of the SIP that is to be archived.
        :type sip: :py:class:`invenio_sipstore.api.SIP`
            or :py:class:`invenio_sipstore.models.SIP`
        :param data_dir: directory where the SIPFiles will be written.
        :param metadata_dir: directory where the SIPMetadata will be written.
        :param extra_dir: directory where all extra files will be written,
            including the BagIt-specific files.
        :param patch_of: Write a 'lightweight' bag, which will archive only
            the new SIPFiles, and refer to the repeated ones in "fetch.txt"
            file. The provided argument is a SIP API, which will be taken as a
            base for determining the "diff" between two bags. A value that is
            not already a SIP API instance is wrapped into one.
        :type patch_of: :py:class:`invenio_sipstore.api.SIP`
            or :py:class:`invenio_sipstore.models.SIP`
        :param bool include_all_previous: If set to True and if 'patch_of' is
            used, include the files that are missing in the SIP w.r.t. to
            the 'patch_of' SIP in the manifest.
            The opposite (include_all_previous=False) is equivalent to
            treating those as explicitly deleted - the files will not be
            included in the manifest, nor in the "fetch.txt" file.
        :param tags: a list of 2-tuple containing the tags of the bagit,
            which will be written to the 'bag-info.txt' file. When empty or
            not given, the 'SIPSTORE_BAGIT_TAGS' configuration value is used.
        :param filenames_mapping_file: filepath of the file in the archive
            which contains all of SIPFile mappings. If this parameter is
            boolean-resolvable as False, the file will not be created.
        """
        super(BagItArchiver, self).__init__(
            sip, data_dir=data_dir, metadata_dir=metadata_dir,
            extra_dir=extra_dir, filenames_mapping_file=filenames_mapping_file)
        self.tags = tags or current_app.config['SIPSTORE_BAGIT_TAGS']
        # Normalize 'patch_of': nothing given -> None, API SIP -> kept as is,
        # anything else (e.g. a SIP model) -> wrapped into the API class
        if not patch_of:
            self.patch_of = None
        elif isinstance(patch_of, SIP):
            self.patch_of = patch_of
        else:
            self.patch_of = SIP(patch_of)
        self.include_all_previous = include_all_previous
def sips(db, locations, sip_metadata_types):
    """Fixture building five SIPs, some of which share file instances.

    Layout of the objects created here:
    SIP-1: File1 (created with agent info, carries extra 'txt-test' metadata)
    SIP-2: File1, File2
    SIP-3: File2 (renamed on the SIPFile, same FileInstance), File3
    SIP-4: File4, File5, File6 (awkward file paths, non-ASCII content)
    SIP-5: no files, only 'marcxml-test' metadata

    :param db: not referenced in the body, the session is reached through
        ``db_`` (presumably requested only for its setup - confirm against
        the conftest).
    :param locations: mapping whose 'default' entry provides the URI passed
        as default location when writing file contents.
    :param sip_metadata_types: not referenced in the body (presumably
        requested so the metadata types exist - confirm against the
        conftest).
    :returns: list with the five SIP API objects, ordered SIP-1 to SIP-5.
    """
    def _with_metadata(model, entries):
        """Wrap ``model`` in a SIP API object and attach entries in order."""
        wrapped = SIPApi(model)
        for type_name, payload in entries:
            wrapped.attach_metadata(type_name, payload)
        return wrapped

    def _stored_file(payload):
        """Create a FileInstance and write ``payload`` (bytes) into it."""
        created = FileInstance.create()
        created.set_contents(BytesIO(payload),
                             default_location=locations['default'].uri)
        return created

    def _link(model, links):
        """Build one SIPFile per (filepath, file) pair, then add them all."""
        sipfiles = [
            SIPFile(sip_id=model.id, filepath=path, file_id=instance.id)
            for path, instance in links
        ]
        for sipfile in sipfiles:
            db_.session.add(sipfile)

    # SIP-1: the only SIP created with agent information.
    agent_info = {
        'email': '*****@*****.**',
        'orcid': '1111-1111-1111-1111',
        'ip_address': '1.1.1.1'
    }
    model1 = SIP.create(agent=agent_info)
    # 'txt-test' is attached as well, but it is not expected to be archived
    # (see conftest configuration).
    api1 = _with_metadata(model1, [
        ('marcxml-test', '<p>XML 1</p>'),
        ('json-test', '{"title": "JSON 1"}'),
        ('txt-test', 'Title: TXT 1'),
    ])
    first_file = _stored_file(b('test'))
    _link(model1, [("foobar.txt", first_file)])

    # SIP-2: reuses the first file and introduces a second one.
    model2 = SIP.create()
    api2 = _with_metadata(model2, [
        ('marcxml-test', '<p>XML 2</p>'),
        ('json-test', '{"title": "JSON 2"}'),
    ])
    second_file = _stored_file(b'test-second')
    _link(model2, [
        ("foobar.txt", first_file),
        ("foobar2.txt", second_file),
    ])

    # SIP-3: same second FileInstance under a new path, plus a third file.
    model3 = SIP.create()
    api3 = _with_metadata(model3, [
        ('marcxml-test', '<p>XML 3</p>'),
        ('json-test', '{"title": "JSON 3"}'),
    ])
    third_file = _stored_file(b'test-third')
    _link(model3, [
        ("foobar2-renamed.txt", second_file),
        ("foobar3.txt", third_file),
    ])

    # SIP-4: naughty filenames (relative traversal, URL-like, non-ASCII).
    model4 = SIP.create()
    api4 = _with_metadata(model4, [
        ('marcxml-test', '<p>XML 4 żółć</p>'),
        ('json-test', '{"title": "JSON 4 żółć"}'),
    ])
    fourth_file = _stored_file('test-fourth żółć'.encode('utf-8'))
    fifth_file = _stored_file('test-fifth ąęćźə'.encode('utf-8'))
    sixth_file = _stored_file('test-sixth π'.encode('utf-8'))
    _link(model4, [
        ("../../foobar.txt", fourth_file),
        ("http://maliciouswebsite.com/hack.js", fifth_file),
        ("łóżźćąę.dat", sixth_file),
    ])

    # SIP-5: metadata-only change, no SIPFile is attached.
    model5 = SIP.create()
    api5 = _with_metadata(model5, [
        ('marcxml-test', '<p>XML 5 Meta Only</p>'),
    ])

    db_.session.commit()
    return [api1, api2, api3, api4, api5]
Exemple #26
0
def sips(db, locations, sip_metadata_types):
    """Fixture returning five SIP API objects with partly shared files.

    The SIPs and the files attached to them:
    SIP-1: File1 (the SIP is created with agent info)
    SIP-2: File1, File2
    SIP-3: File2 (renamed on the SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6 (problematic file paths)
    SIP-5: metadata only, no files

    :param db: unused in the body, which goes through ``db_`` for the
        session (presumably requested for its setup only - to be confirmed
        against the conftest).
    :param locations: mapping, its 'default' entry supplies the URI used as
        default location for every file written here.
    :param sip_metadata_types: unused in the body (presumably ensures the
        metadata types exist beforehand - to be confirmed).
    :returns: ``[SIP-1, SIP-2, SIP-3, SIP-4, SIP-5]`` as SIP API objects.
    """
    # --- SIP-1: carries agent info -------------------------------------
    record_a = SIP.create(agent={
        'email': '*****@*****.**',
        'orcid': '1111-1111-1111-1111',
        'ip_address': '1.1.1.1',
    })
    api_a = SIPApi(record_a)
    # The trailing 'txt-test' entry is attached, yet it should not end up
    # in the archive (see conftest configuration).
    for meta_type, meta_body in (
            ('marcxml-test', '<p>XML 1</p>'),
            ('json-test', '{"title": "JSON 1"}'),
            ('txt-test', 'Title: TXT 1')):
        api_a.attach_metadata(meta_type, meta_body)
    blob_1 = FileInstance.create()
    blob_1.set_contents(
        BytesIO(b('test')), default_location=locations['default'].uri)
    db_.session.add(
        SIPFile(sip_id=record_a.id, filepath="foobar.txt", file_id=blob_1.id))

    # --- SIP-2: first file again, plus a new one -----------------------
    record_b = SIP.create()
    api_b = SIPApi(record_b)
    for meta_type, meta_body in (
            ('marcxml-test', '<p>XML 2</p>'),
            ('json-test', '{"title": "JSON 2"}')):
        api_b.attach_metadata(meta_type, meta_body)
    blob_2 = FileInstance.create()
    blob_2.set_contents(
        BytesIO(b'test-second'), default_location=locations['default'].uri)
    # Both links are built before either is added to the session.
    for link in (
            SIPFile(sip_id=record_b.id, filepath="foobar.txt",
                    file_id=blob_1.id),
            SIPFile(sip_id=record_b.id, filepath="foobar2.txt",
                    file_id=blob_2.id)):
        db_.session.add(link)

    # --- SIP-3: second file under another path, plus a third file ------
    record_c = SIP.create()
    api_c = SIPApi(record_c)
    for meta_type, meta_body in (
            ('marcxml-test', '<p>XML 3</p>'),
            ('json-test', '{"title": "JSON 3"}')):
        api_c.attach_metadata(meta_type, meta_body)
    blob_3 = FileInstance.create()
    blob_3.set_contents(
        BytesIO(b'test-third'), default_location=locations['default'].uri)
    for link in (
            SIPFile(sip_id=record_c.id, filepath="foobar2-renamed.txt",
                    file_id=blob_2.id),
            SIPFile(sip_id=record_c.id, filepath="foobar3.txt",
                    file_id=blob_3.id)):
        db_.session.add(link)

    # --- SIP-4: naughty filenames --------------------------------------
    record_d = SIP.create()
    api_d = SIPApi(record_d)
    for meta_type, meta_body in (
            ('marcxml-test', '<p>XML 4 żółć</p>'),
            ('json-test', '{"title": "JSON 4 żółć"}')):
        api_d.attach_metadata(meta_type, meta_body)
    blob_4 = FileInstance.create()
    blob_4.set_contents(
        BytesIO('test-fourth żółć'.encode('utf-8')),
        default_location=locations['default'].uri)
    blob_5 = FileInstance.create()
    blob_5.set_contents(
        BytesIO('test-fifth ąęćźə'.encode('utf-8')),
        default_location=locations['default'].uri)
    blob_6 = FileInstance.create()
    blob_6.set_contents(
        BytesIO('test-sixth π'.encode('utf-8')),
        default_location=locations['default'].uri)
    for link in (
            SIPFile(sip_id=record_d.id, filepath="../../foobar.txt",
                    file_id=blob_4.id),
            SIPFile(sip_id=record_d.id,
                    filepath="http://maliciouswebsite.com/hack.js",
                    file_id=blob_5.id),
            SIPFile(sip_id=record_d.id, filepath="łóżźćąę.dat",
                    file_id=blob_6.id)):
        db_.session.add(link)

    # --- SIP-5: metadata-only changes ----------------------------------
    record_e = SIP.create()
    api_e = SIPApi(record_e)
    api_e.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')

    db_.session.commit()
    return [api_a, api_b, api_c, api_d, api_e]