def test_get_checksum():
    """Test the function _get_checksum."""
    with pytest.raises(AttributeError):
        BagItArchiver._get_checksum('sha1:12')
    with pytest.raises(AttributeError):
        BagItArchiver._get_checksum('md5')
    assert BagItArchiver._get_checksum('md5:12') == '12'
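
# The test above pins down the contract of _get_checksum: only 'md5:<digest>'
# strings are accepted and the bare digest is returned. A minimal sketch that
# is consistent with that contract (hypothetical, not the actual library
# code) could look like this:
def _get_checksum_sketch(checksum, expected='md5'):
    """Return the digest part of 'algo:digest', accepting only `expected`."""
    algo, _, value = checksum.partition(':')
    if algo != expected or not value:
        # Mirrors the AttributeError the test expects for 'sha1:12' and 'md5'
        raise AttributeError('Checksum format is not supported.')
    return value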
def archive_sip(sip_uuid):
    """Send the SIP for archiving.

    Retries every 4 hours, six times, which should cover up to 24 hours of
    archiving-system downtime.

    :param sip_uuid: UUID of the SIP for archiving.
    :type sip_uuid: str
    """
    try:
        sip = SIPApi(SIP.query.get(sip_uuid))
        archiver = BagItArchiver(sip)
        bagmeta = archiver.get_bagit_metadata(sip)
        if bagmeta is None:
            raise ArchivingError(
                'Bagit metadata does not exist for SIP: {0}.'.format(sip.id))
        if sip.archived:
            raise ArchivingError(
                'SIP {0} was already archived.'.format(sip.id))
        archiver.write_all_files()
        sip.archived = True
        db.session.commit()
    except Exception as exc:
        # On ArchivingError (see above), do not retry, but re-raise
        if not isinstance(exc, ArchivingError):
            archive_sip.retry(exc=exc)
        raise
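
# The retry policy in the docstring above implies a Celery task configuration
# roughly like the sketch below. The decorator options are an assumption
# inferred from "retries every 4 hours, six times", not taken from the
# source:
from celery import shared_task


@shared_task(ignore_result=True, max_retries=6,
             default_retry_delay=4 * 60 * 60)  # retry every 4h, six times
def archive_sip_sketch(sip_uuid):
    """Hypothetical stand-in showing only the assumed retry configuration."""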
def test_constructor(sips):
    """Test the archiver constructor."""
    s = BaseArchiver(sips[0].model).sip
    s2 = BaseArchiver(sips[0]).sip
    assert isinstance(s, SIPApi)
    assert isinstance(s2, SIPApi)

    a = BagItArchiver(sips[1], patch_of=sips[0])
    a2 = BagItArchiver(sips[1].model, patch_of=sips[0].model)
    assert isinstance(a.sip, SIPApi)
    assert isinstance(a.patch_of, SIPApi)
    assert isinstance(a2.sip, SIPApi)
    assert isinstance(a2.patch_of, SIPApi)
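
# test_constructor above shows that the archivers accept either a SIP model
# or a SIPApi instance and normalize both to SIPApi. A minimal sketch of that
# normalization (hypothetical; the real constructors do more, and SIPApi is
# assumed to be imported as in the snippets above):
class BaseArchiverSketch(object):
    """Accept a SIP model or API object and always expose a SIPApi."""

    def __init__(self, sip):
        self.sip = sip if isinstance(sip, SIPApi) else SIPApi(sip)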
def test_write_all_files(sips, archive_fs):
    """Test the functions used to create an export of the SIP."""
    sip = sips[0]
    archiver = BagItArchiver(sip)
    assert not len(archive_fs.listdir())
    archiver.write_all_files()
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    assert set(fs.listdir()) == \
        set(['tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
             'bag-info.txt', 'data', ])
    assert set(fs.listdir('data')) == \
        set(['metadata', 'files', 'filenames.txt'])
    assert set(fs.listdir('data/metadata')) == \
        set(['marcxml-test.xml', 'json-test.json', ])
    assert set(fs.listdir('data/files')) == set(['foobar.txt', ])
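
    # A hedged aside: if the bag were materialized on a real filesystem, the
    # standard `bagit` library could verify the layout asserted above:
    #
    #     import bagit
    #     bag = bagit.Bag(
    #         archive_fs.getsyspath(archiver.get_archive_subpath()))
    #     assert bag.is_valid()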
    def publish(self,
                pid=None,
                id_=None,
                user_id=None,
                sip_agent=None,
                spam_check=True):
        """Publish the Zenodo deposit."""
        self['owners'] = self['_deposit']['owners']
        self.validate_publish()
        if spam_check:
            self.spam_check()

        is_first_publishing = not self.is_published()

        deposit = super(ZenodoDeposit, self).publish(pid, id_)
        recid, record = deposit.fetch_published()

        pv = PIDVersioning(child=recid)
        is_new_version = pv.children.count() > 1
        # a) For a new version, fetch the last SIP of the previous version
        # b) For a metadata edit, fetch the previous SIP of the same record
        if is_new_version or (not is_first_publishing):
            if is_new_version:
                sip_recid = pv.children.all()[-2]
            else:  # (not is_first_publishing)
                sip_recid = recid
            # Get the last SIP of the relevant recid, i.e.: either last
            # version or the current one
            sip_patch_of = (db.session.query(SIPModel).join(
                RecordSIPModel, RecordSIPModel.sip_id == SIPModel.id).filter(
                    RecordSIPModel.pid_id == sip_recid.id).order_by(
                        SIPModel.created.desc()).first())
        else:
            sip_patch_of = None

        recordsip = RecordSIP.create(recid,
                                     record,
                                     archivable=True,
                                     create_sip_files=is_first_publishing,
                                     user_id=user_id,
                                     agent=sip_agent)
        archiver = BagItArchiver(
            recordsip.sip,
            include_all_previous=(not is_first_publishing),
            patch_of=sip_patch_of)
        archiver.save_bagit_metadata()
        return deposit
    def publish(self, pid=None, id_=None, user_id=None, sip_agent=None):
        """Publish the Zenodo deposit."""
        self['owners'] = self['_deposit']['owners']
        self.validate_publish()
        is_first_publishing = not self.is_published()

        deposit = super(ZenodoDeposit, self).publish(pid, id_)
        recid, record = deposit.fetch_published()

        pv = PIDVersioning(child=recid)
        is_new_version = pv.children.count() > 1
        # a) For a new version, fetch the last SIP of the previous version
        # b) For a metadata edit, fetch the previous SIP of the same record
        if is_new_version or (not is_first_publishing):
            if is_new_version:
                sip_recid = pv.children.all()[-2]
            else:  # (not is_first_publishing)
                sip_recid = recid
            # Get the last SIP of the relevant recid, i.e.: either last
            # version or the current one
            sip_patch_of = (
                db.session.query(SIPModel)
                .join(RecordSIPModel, RecordSIPModel.sip_id == SIPModel.id)
                .filter(RecordSIPModel.pid_id == sip_recid.id)
                .order_by(SIPModel.created.desc())
                .first()
            )
        else:
            sip_patch_of = None

        recordsip = RecordSIP.create(
            recid, record, archivable=True,
            create_sip_files=is_first_publishing, user_id=user_id,
            agent=sip_agent)
        archiver = BagItArchiver(
            recordsip.sip, include_all_previous=(not is_first_publishing),
            patch_of=sip_patch_of)
        archiver.save_bagit_metadata()
        return deposit
def test_save_bagit_metadata(sips):
    """Test saving of bagit metadata."""
    sip = sips[0]
    assert not BagItArchiver.get_bagit_metadata(sip)
    archiver = BagItArchiver(sip)
    archiver.save_bagit_metadata()
    bmeta = BagItArchiver.get_bagit_metadata(sip, as_dict=True)
    file_m = next(f for f in bmeta['files'] if 'sipfilepath' in f)
    assert file_m['sipfilepath'] == 'foobar.txt'
    assert file_m['filepath'] == 'data/files/foobar.txt'

    sip.model.sip_files[0].filepath = 'changed.txt'
    with pytest.raises(Exception) as excinfo:
        archiver.save_bagit_metadata()
    assert 'Attempting to save' in str(excinfo.value)
    archiver.save_bagit_metadata(overwrite=True)
    bmeta = BagItArchiver.get_bagit_metadata(sip, as_dict=True)

    file_m = next(f for f in bmeta['files'] if 'sipfilepath' in f)
    assert file_m['sipfilepath'] == 'changed.txt'
    assert file_m['filepath'] == 'data/files/changed.txt'
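
# The overwrite protection exercised above suggests logic along these lines
# inside save_bagit_metadata (a hypothetical sketch, not the library code):
#
#     if self.get_bagit_metadata(self.sip) and not overwrite:
#         raise Exception(
#             'Attempting to save BagIt metadata which already exists; '
#             'pass overwrite=True to replace it.')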
def test_get_all_files(sips):
    """Test the function get_all_files."""
    archiver = BagItArchiver(sips[0])
    files = archiver.get_all_files()
    assert len(files) == 8
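
# test_write_patched below relies on two helpers that this excerpt does not
# define. The sketches here are inferred from the call sites; the names,
# return shapes and the ``sip.files`` attribute are assumptions:
import hashlib


def _read_file(fs, filepath):
    """Return the MD5 checksum and archive path of a file inside the bag."""
    with fs.open(filepath, 'rb') as fp:
        content = fp.read()
    return {'checksum': hashlib.md5(content).hexdigest(),
            'filepath': filepath}


def fetch_file_endswith(sip, suffix):
    """Return the SIPFile of ``sip`` whose filepath ends with ``suffix``."""
    return next(f for f in sip.files if f.filepath.endswith(suffix))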
def test_write_patched(mocker, sips, archive_fs,
                       secure_sipfile_name_formatter):
    """Test the BagIt archiving with previous SIP as a base."""
    # Mock the bagging date generation so the 'Bagging-Date' tag is predefined
    dt = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")
    mocker.patch('invenio_sipstore.archivers.bagit_archiver.BagItArchiver.'
                 '_generate_bagging_date', return_value=dt)

    arch1 = BagItArchiver(sips[0])
    arch1.write_all_files()
    arch2 = BagItArchiver(sips[1], patch_of=sips[0])
    arch2.write_all_files()
    arch3 = BagItArchiver(sips[2], patch_of=sips[1],
                          include_all_previous=True)
    arch3.write_all_files()
    arch5 = BagItArchiver(sips[4], patch_of=sips[2],
                          include_all_previous=True)
    arch5.write_all_files()
    # NOTE: We archive only SIP-1, SIP-2, SIP-3 and SIP-5.
    # Enumeration of the related objects follows the "sips" fixture naming.
    fs1 = archive_fs.opendir(arch1.get_archive_subpath())
    fs2 = archive_fs.opendir(arch2.get_archive_subpath())
    fs3 = archive_fs.opendir(arch3.get_archive_subpath())
    fs5 = archive_fs.opendir(arch5.get_archive_subpath())
    assert len(fs1.listdir()) == 5
    assert len(fs2.listdir()) == 6  # Includes 'fetch.txt'
    assert len(fs3.listdir()) == 6  # Includes 'fetch.txt'
    assert len(fs5.listdir()) == 6  # Includes 'fetch.txt'

    # Check SIP-1,2,3,5 data contents
    assert set(fs1.listdir('data')) == \
        set(['files', 'metadata', 'filenames.txt'])
    assert len(fs1.listdir('data/files')) == 1
    assert len(fs1.listdir('data/metadata')) == 2

    assert set(fs2.listdir('data')) == \
        set(['files', 'metadata', 'filenames.txt'])
    assert len(fs2.listdir('data/files')) == 1
    assert len(fs2.listdir('data/metadata')) == 2

    assert set(fs3.listdir('data')) == \
        set(['files', 'metadata', 'filenames.txt'])
    assert len(fs3.listdir('data/files')) == 1
    assert len(fs3.listdir('data/metadata')) == 2

    assert set(fs5.listdir('data')) == \
        set(['metadata', 'filenames.txt'])
    assert len(fs5.listdir('data/metadata')) == 1

    # Fetch the filenames for easier fixture formatting below
    file1_fn = '{0}-foobar.txt'.format(
        fetch_file_endswith(sips[0], 'foobar.txt').file_id)
    file2_fn = '{0}-foobar2.txt'.format(
        fetch_file_endswith(sips[1], 'foobar2.txt').file_id)
    file3_fn = '{0}-foobar3.txt'.format(
        fetch_file_endswith(sips[2], 'foobar3.txt').file_id)
    file2_rn_fn = '{0}-foobar2-renamed.txt'.format(
        fetch_file_endswith(sips[2], 'foobar2-renamed.txt').file_id)

    # Both file2_fn and file2_rn_fn refer to the same FileInstance,
    # so their UUID prefixes (first 36 characters) should match
    assert file2_fn[:36] == file2_rn_fn[:36]
    expected_sip1 = [
        ('data/files/{0}'.format(file1_fn), 'test'),
        ('data/metadata/marcxml-test.xml', '<p>XML 1</p>'),
        ('data/metadata/json-test.json', '{"title": "JSON 1"}'),
        ('bagit.txt',
            'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/metadata/json-test.json')),
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
        ])),
        ('bag-info.txt', (
            "Source-Organization: European Organization for Nuclear Research\n"
            "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
            "Bagging-Date: {0}\n".format(dt) +
            "Payload-Oxum: 93.4\n"
            "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[0].id) +
            "External-Description: BagIt archive of SIP."
        )),
    ]
    expected_sip2 = [
        ('data/files/{0}'.format(file2_fn), 'test-second'),
        ('data/metadata/marcxml-test.xml', '<p>XML 2</p>'),
        ('data/metadata/json-test.json', '{"title": "JSON 2"}'),
        ('bagit.txt',
            'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('fetch.txt', set(["{0} {1} {2}".format(
            fs1.getsyspath('data/files/{0}'.format(file1_fn)),
            4, 'data/files/{0}'.format(file1_fn)),
        ])),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/files/{0}'.format(file2_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/metadata/json-test.json')),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
            '{0} foobar2.txt'.format(file2_fn),
        ])),
        ('bag-info.txt', (
            "Source-Organization: European Organization for Nuclear Research\n"
            "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
            "Bagging-Date: {0}\n".format(dt) +
            "Payload-Oxum: 165.5\n"
            "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[1].id) +
            "External-Description: BagIt archive of SIP."
        )),
    ]
    expected_sip3 = [
        ('data/files/{0}'.format(file3_fn), 'test-third'),
        ('data/metadata/marcxml-test.xml', '<p>XML 3</p>'),
        ('data/metadata/json-test.json', '{"title": "JSON 3"}'),
        ('bagit.txt',
            'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('fetch.txt', set([
            "{0} {1} {2}".format(
                fs1.getsyspath('data/files/{0}'.format(file1_fn)),
                4, 'data/files/{0}'.format(file1_fn)),
            # The entry below fetches the file under its original filename
            # (file2_fn), since that is how it is archived in SIP-2; the
            # destination, however, uses the renamed filename (file2_rn_fn).
            # This is correct and expected behaviour.
            "{0} {1} {2}".format(
                fs2.getsyspath('data/files/{0}'.format(file2_fn)),
                11, 'data/files/{0}'.format(file2_rn_fn)),
        ])),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            # Manifest also specifies the renamed filename for File-2
            "{checksum} data/files/{newfilename}".format(
                newfilename=file2_rn_fn,
                **_read_file(fs2, 'data/files/{0}'.format(file2_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/files/{0}'.format(file3_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/metadata/json-test.json')),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
            '{0} foobar2.txt'.format(file2_fn),
            '{0} foobar3.txt'.format(file3_fn),
        ])),
        ('bag-info.txt', (
            "Source-Organization: European Organization for Nuclear Research\n"
            "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
            "Bagging-Date: {0}\n".format(dt) +
            "Payload-Oxum: 236.6\n"
            "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[2].id) +
            "External-Description: BagIt archive of SIP."
        )),
    ]

    expected_sip5 = [
        ('data/metadata/marcxml-test.xml', '<p>XML 5 Meta Only</p>'),
        ('bagit.txt',
            'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('fetch.txt', set([
            "{0} {1} {2}".format(
                fs1.getsyspath('data/files/{0}'.format(file1_fn)),
                4, 'data/files/{0}'.format(file1_fn)),
            # As in "expected_sip3" above, the file is fetched under its
            # original filename (file2_fn), as archived in SIP-2, while the
            # destination uses the renamed filename (file2_rn_fn).
            # This is correct and expected behaviour.
            "{0} {1} {2}".format(
                fs2.getsyspath('data/files/{0}'.format(file2_fn)),
                11, 'data/files/{0}'.format(file2_rn_fn)),
            "{0} {1} {2}".format(
                fs3.getsyspath('data/files/{0}'.format(file3_fn)),
                10, 'data/files/{0}'.format(file3_fn)),
        ])),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            # Manifest also specifies the renamed filename for File-2
            "{checksum} data/files/{newfilename}".format(
                newfilename=file2_rn_fn,
                **_read_file(fs2, 'data/files/{0}'.format(file2_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/files/{0}'.format(file3_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs5, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs5, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
            '{0} foobar2.txt'.format(file2_fn),
            '{0} foobar3.txt'.format(file3_fn),
        ])),
        ('bag-info.txt', (
            "Source-Organization: European Organization for Nuclear Research\n"
            "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
            "Bagging-Date: {0}\n".format(dt) +
            "Payload-Oxum: 227.5\n"
            "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[4].id) +
            "External-Description: BagIt archive of SIP."
        )),
    ]

    for fs, expected in [(fs1, expected_sip1),
                         (fs2, expected_sip2),
                         (fs3, expected_sip3),
                         (fs5, expected_sip5)]:
        for fn, exp_content in expected:
            with fs.open(fn) as fp:
                if isinstance(exp_content, set):
                    content = set(fp.read().splitlines())
                else:
                    content = fp.read()
            assert content == exp_content
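
# test_archiving below uses a few helpers and one fixture-like function that
# this excerpt does not define. Hedged sketches inferred from the call sites
# (names, signatures and bodies are assumptions, not the actual test code).
# Likewise, BagItArchiver._is_fetched presumably reduces to something like
# ``bool(entry.get('fetched'))`` (an assumption):


def fetch_suff(sip, suffix):
    """Return the SIPFile of ``sip`` whose filepath ends with ``suffix``."""
    return next(f for f in sip.files if f.filepath.endswith(suffix))


def get_m_item(bagmeta_files, filepath):
    """Return the BagIt metadata entry for ``filepath``."""
    return next(f for f in bagmeta_files if f['filepath'] == filepath)


def publish_and_expunge(db, deposit):
    """Publish the deposit, commit and expunge the session.

    Expunging forces later queries to load fresh instances from the
    database, which the assertions in the test depend on.
    """
    deposit = deposit.publish()
    dep_uuid = deposit.id
    db.session.commit()
    db.session.expunge_all()
    return ZenodoDeposit.get_record(dep_uuid)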
def test_archiving(app, db, deposit, deposit_file, locations, archive_fs):
    """Test ZenodoSIP archiving."""
    # Stash the configuration and enable writing
    orig = app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED']
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = True
    deposit.files['test2.txt'] = BytesIO(b'test-two')
    deposit_v1 = publish_and_expunge(db, deposit)
    recid_v1, record_v1 = deposit_v1.fetch_published()
    recid_v1_id = recid_v1.id
    # Record files after publishing: 'test.txt', 'test2.txt'

    sip1 = SIP(SIPModel.query.one())
    sip1_id = sip1.id

    # Edit the metadata
    deposit_v1 = deposit_v1.edit()
    deposit_v1['title'] = "New title"
    deposit_v1 = publish_and_expunge(db, deposit_v1)
    # Record files after publishing: 'test.txt', 'test2.txt'
    sip2_id = SIPModel.query.order_by(SIPModel.created.desc()).first().id

    # Create a new version
    deposit_v1.newversion()
    recid_v1 = PersistentIdentifier.query.get(recid_v1_id)
    pv = PIDVersioning(child=recid_v1)
    depid_v2 = pv.draft_child_deposit
    deposit_v2 = ZenodoDeposit.get_record(depid_v2.object_uuid)
    del deposit_v2.files['test.txt']
    deposit_v2.files['test3.txt'] = BytesIO(b'test-three')
    deposit_v2 = publish_and_expunge(db, deposit_v2)
    # Record files after publishing: 'test2.txt', 'test3.txt'

    sip1 = SIP(SIPModel.query.get(sip1_id))
    sip2 = SIP(SIPModel.query.get(sip2_id))
    sip3 = SIP(SIPModel.query.order_by(SIPModel.created.desc()).first())

    # Because we are using secure_filename when writing SIPFiles, we need to
    # generate the correct names: <SIPFile.id>-<secure_filename>
    s1_file1_fn = '{0}-test.txt'.format(fetch_suff(sip1, 'test.txt').file_id)
    s1_file1_fp = 'data/files/{0}'.format(s1_file1_fn)

    s1_file2_fn = '{0}-test2.txt'.format(fetch_suff(sip1, 'test2.txt').file_id)
    s1_file2_fp = 'data/files/{0}'.format(s1_file2_fn)

    s3_file2_fn = '{0}-test2.txt'.format(fetch_suff(sip3, 'test2.txt').file_id)
    s3_file2_fp = 'data/files/{0}'.format(s3_file2_fn)

    s3_file3_fn = '{0}-test3.txt'.format(fetch_suff(sip3, 'test3.txt').file_id)
    s3_file3_fp = 'data/files/{0}'.format(s3_file3_fn)

    sip1_bagmeta = json.loads(
        next(m.content for m in sip1.metadata
             if m.type.name == 'bagit'))['files']
    sip2_bagmeta = json.loads(
        next(m.content for m in sip2.metadata
             if m.type.name == 'bagit'))['files']
    sip3_bagmeta = json.loads(
        next(m.content for m in sip3.metadata
             if m.type.name == 'bagit'))['files']

    # Check if Bagit metadata contains the correct file-fetching information
    assert set([f['filepath'] for f in sip1_bagmeta]) == \
        set([s1_file1_fp,
             s1_file2_fp,
             'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt'])
    assert not BagItArchiver._is_fetched(get_m_item(sip1_bagmeta, s1_file1_fp))
    assert not BagItArchiver._is_fetched(get_m_item(sip1_bagmeta, s1_file2_fp))

    assert set([f['filepath'] for f in sip2_bagmeta]) == \
        set([s1_file1_fp,
             s1_file2_fp,
             'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])
    # Both files should be fetched, since this is a metadata-only edit
    assert BagItArchiver._is_fetched(get_m_item(sip2_bagmeta, s1_file1_fp))
    assert BagItArchiver._is_fetched(get_m_item(sip2_bagmeta, s1_file2_fp))

    assert set([f['filepath'] for f in sip3_bagmeta]) == \
        set([s3_file2_fp,
             s3_file3_fp,
             'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])

    # First file should be fetched from previous version and new file should
    # be archived in this bag.
    assert BagItArchiver._is_fetched(get_m_item(sip3_bagmeta, s3_file2_fp))
    assert not BagItArchiver._is_fetched(get_m_item(sip3_bagmeta, s3_file3_fp))
    archiver1 = BagItArchiver(sip1)
    archiver2 = BagItArchiver(sip2)
    archiver3 = BagItArchiver(sip3)

    # Each archiver subpath follows: '<recid>/r/<ISO-8601-SIP-timestamp>'
    sip1_ts = arrow.get(sip1.model.created).isoformat()
    sip2_ts = arrow.get(sip2.model.created).isoformat()
    sip3_ts = arrow.get(sip3.model.created).isoformat()
    assert archiver1.get_archive_subpath() == '2/r/{0}'.format(sip1_ts)
    assert archiver2.get_archive_subpath() == '2/r/{0}'.format(sip2_ts)
    assert archiver3.get_archive_subpath() == '3/r/{0}'.format(sip3_ts)

    # As a test, write the SIPs in reverse chronological order
    assert not sip1.archived
    assert not sip2.archived
    assert not sip3.archived
    archive_sip.delay(sip3.id)
    archive_sip.delay(sip2.id)
    archive_sip.delay(sip1.id)
    assert sip1.archived
    assert sip2.archived
    assert sip3.archived

    fs1 = archive_fs.opendir(archiver1.get_archive_subpath())
    assert set(fs1.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt', 'bag-info.txt',
        'data'
    ])
    assert set(fs1.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs1.listdir('data/metadata') == [
        'record-json.json',
    ]
    assert set(fs1.listdir('data/files')) == set([s1_file1_fn, s1_file2_fn])

    fs2 = archive_fs.opendir(archiver2.get_archive_subpath())
    assert set(fs2.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt', 'bag-info.txt',
        'data', 'fetch.txt'
    ])
    # The second SIP wrote only the metadata, so there is no 'files/'
    # directory, but 'filenames.txt' is still present because of fetch.txt
    assert set(fs2.listdir('data')) == set(['metadata', 'filenames.txt'])
    assert fs2.listdir('data/metadata') == [
        'record-json.json',
    ]

    with fs2.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # The fetch.txt entries should point at the files in the first archive
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 4 {fn}'.format(fn=s1_file1_fp,
                                               base=base_uri,
                                               s1ts=sip1_ts),
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s1_file2_fp,
                                               base=base_uri,
                                               s1ts=sip1_ts),
    ])

    fs3 = archive_fs.opendir(archiver3.get_archive_subpath())
    assert set(fs3.listdir()) == set([
        'tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt', 'bag-info.txt',
        'data', 'fetch.txt'
    ])
    # Third SIP should write only the extra 'test3.txt' file
    assert set(fs3.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs3.listdir('data/metadata') == [
        'record-json.json',
    ]
    assert fs3.listdir('data/files') == [
        s3_file3_fn,
    ]
    with fs3.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # Since 'test.txt' was removed in the third SIP, we should only fetch
    # 'test2.txt', again from the first archive, since that is where the
    # file physically resides.
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s3_file2_fp,
                                               base=base_uri,
                                               s1ts=sip1_ts),
    ])
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = orig