def test_getters(db, sips, sip_metadata_types, locations):
    """Test the constructor and the getters."""
    sip = sips[0]
    archiver = BaseArchiver(sip)
    assert archiver.get_archive_base_uri() == locations['archive'].uri
    assert archiver.sip is sip

    # Full paths live under the archive location, chunked by the SIP id
    # as <2 chars>/<2 chars>/<rest>/.
    uuid_str = str(sip.id)
    path_template = "{0}/{1}/{2}/{3}/{{filepath}}".format(
        locations['archive'].uri, uuid_str[:2], uuid_str[2:4], uuid_str[4:])

    expected_file = {
        'file_uuid': str(sip.files[0].file_id),
        'filepath': 'files/foobar.txt',
        'filename': 'foobar.txt',
        'sipfilepath': 'foobar.txt',
        'size': 4,
        'fullpath': path_template.format(filepath="files/foobar.txt"),
        'checksum': 'md5:098f6bcd4621d373cade4e832627b4f6'
    }
    assert archiver._get_data_files() == [expected_file]

    expected_metadata = [
        {
            'checksum': 'md5:da4ab7e4c4b762d8e2f3ec3b9f801b1f',
            'fullpath': path_template.format(
                filepath="metadata/json-test.json"),
            'metadata_id': sip_metadata_types['json-test'].id,
            'filepath': 'metadata/json-test.json',
            'size': 19
        },
        {
            'checksum': 'md5:498d1ce86c2e9b9eb85f1e8105affdf6',
            'fullpath': path_template.format(
                filepath="metadata/marcxml-test.xml"),
            'metadata_id': sip_metadata_types['marcxml-test'].id,
            'filepath': 'metadata/marcxml-test.xml',
            'size': 12
        },
        {
            'checksum': 'md5:d7aad7ac23351f42ecbf62cd637ea398',
            'fullpath': path_template.format(
                filepath="metadata/txt-test.txt"),
            'metadata_id': sip_metadata_types['txt-test'].id,
            'filepath': 'metadata/txt-test.txt',
            'size': 12
        },
    ]
    metafiles_info = archiver._get_metadata_files()
    assert len(metafiles_info) == 3
    for meta in expected_metadata:
        assert meta in metafiles_info

    all_files_info = archiver.get_all_files()
    assert len(all_files_info) == 4
    for info in [expected_file] + expected_metadata:
        assert info in all_files_info
# Example #2
def transfer_demo(uuid, config):
    """Transfer the files contained in the sip to the destination.

    Very similar to the rsync transfer. However, because of time, I use the
    VERY UNSECURE sshpass package for rsync authentication.
    DO NOT USE IN PROD!!!

    :param str uuid: the id of the sip containing files to transfer
    :param dict config: here config must be a dict with the following keys:
        - user - the SSH user
        - password_file - a path where the password is stored
        - remote - the URL or IP of the remote
        - remote_path - where to store files on the remote
        - args - the args for rsync
    :returns: the return code of the rsync subprocess
    """
    # we retrieve the archive and the SIP associated
    sip = SIP.get_sip(uuid)
    ark = Archive.get_from_sip(uuid)

    # we export it to the temp folder
    archiver = BaseArchiver(sip)
    archiver.write_all_files()

    # we rsync it to the remote
    src_path = archiver.get_fullpath('')
    dest_path = join(config['remote_path'], ark.accession_id)
    dest_path = '{}:{}'.format(config['remote'], dest_path)
    # BUGFIX: the password-file placeholder was missing from the command
    # template, so ``format(filename=...)`` was silently ignored and sshpass
    # never received the password file path.
    ssh_command = 'sshpass -f {filename} ssh -l {user}'.format(
        filename=config['password_file'], user=config['user'])
    return call([
        'rsync', config['args'], '--rsh={}'.format(ssh_command), src_path,
        dest_path
    ])
def test_constructor(sips):
    """Test the archiver constructor."""
    # Both the raw model and the API wrapper must be accepted and exposed
    # as a SIPApi instance on the archiver.
    for sip_arg in (sips[0].model, sips[0]):
        assert isinstance(BaseArchiver(sip_arg).sip, SIPApi)

    # Same normalization applies to the ``patch_of`` argument of BagIt.
    for sip_arg, patch_arg in ((sips[1], sips[0]),
                               (sips[1].model, sips[0].model)):
        archiver = BagItArchiver(sip_arg, patch_of=patch_arg)
        assert isinstance(archiver.sip, SIPApi)
        assert isinstance(archiver.patch_of, SIPApi)
# Example #4
def transfer_rsync(uuid, config):
    """Transfer the files contained in the sip to the destination.

    The transfer is done with a rsync. If transfer to remote, you need a valid
    ssh setup.

    This method is automatically called by the module to transfer the files.
    Depending on your installation, you may want to have a different behavior
    (copy among servers...). Then, you can create your own factory and link it
    into the config variable
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FACTORY`.

    The config needs to include at least the destination folder. If transfer
    to remote, it needs to include the user and the server. In either cases,
    you can include usual rsync parameters. See
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FOLDER`:

    .. code-block:: python

        ARCHIVEMATICA_TRANSFER_FOLDER = {
            'server': 'localhost',
            'user': '******',
            'destination': '/tmp',
            'args': '-az'
        }

    :param str uuid: the id of the sip containing files to transfer
    :param config: the config for rsync
    """
    sip = SIP.get_sip(uuid)

    # Export everything into a temporary folder first.
    archiver = BaseArchiver(sip)
    archiver.write_all_files()

    # Build the rsync destination: remote "user@server:dest" only when both
    # credentials are configured, otherwise a plain local path.
    src_path = archiver.get_fullpath('')
    dest_path = config['destination']
    if config.get('server', None) and config.get('user', None):
        dest_path = '{user}@{server}:{dest}'.format(
            user=config['user'], server=config['server'], dest=dest_path)
    try:
        return_code = call(['rsync', config['args'], src_path, dest_path])
    finally:
        # Always remove the temporary export, even if rsync raised.
        rmtree(src_path)
    return return_code
def test_write_all(db, sips, sip_metadata_types, locations, archive_fs):
    """Test the public "write_all_files" method."""
    archiver = BaseArchiver(sips[0])
    assert not archive_fs.listdir()
    archiver.write_all_files()
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    # Exactly 'metadata' and 'files' at the top level of the archive.
    assert len(fs.listdir()) == 2
    assert len(fs.listdir('metadata')) == 2
    assert len(fs.listdir('files')) == 1
    expected = {
        ('metadata/marcxml-test.xml', '<p>XML 1</p>'),
        ('metadata/json-test.json', '{"title": "JSON 1"}'),
        ('files/foobar.txt', 'test'),
    }
    for filepath, expected_content in expected:
        with fs.open(filepath, 'r') as stream:
            assert stream.read() == expected_content
def test_write_all(db, sips, sip_metadata_types, locations, archive_fs):
    """Test the public "write_all_files" method."""
    archiver = BaseArchiver(sips[0])
    assert not archive_fs.listdir()
    archiver.write_all_files()
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    # Exactly 'metadata' and 'files' at the top level of the archive.
    assert len(fs.listdir()) == 2
    assert len(fs.listdir('metadata')) == 3
    assert len(fs.listdir('files')) == 1
    expected_contents = {
        'metadata/marcxml-test.xml': '<p>XML 1</p>',
        'metadata/json-test.json': '{"title": "JSON 1"}',
        'metadata/txt-test.txt': 'Title: TXT 1',
        'files/foobar.txt': 'test',
    }
    for filepath, content in expected_contents.items():
        with fs.open(filepath, 'r') as stream:
            assert stream.read() == content
# Example #7
def transfer_cp(uuid, config):
    """Transfer the files contained in the sip to a local destination.

    The transfer is done with a simple copy of files.

    This method is automatically called by the module to transfer the files.
    Depending on your installation, you may want to have a different behavior
    (copy among servers...). Then, you can create your own factory and link it
    into the config variable
    :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FACTORY`.

    :param str uuid: the id of the sip containing files to transfer
    :param config: can be empty. It will have the content of the variable
        :py:data:`invenio_archivematica.config.ARCHIVEMATICA_TRANSFER_FOLDER`.
        However, it will use the export folder set in
        :py:data:`invenio_sipstore.config.SIPSTORE_ARCHIVER_LOCATION_NAME`
    """
    # The archiver writes into the configured archiver location; a plain
    # export is all the "copy" transfer needs to do.
    archiver = BaseArchiver(SIP.get_sip(uuid))
    archiver.write_all_files()
    return 0
def test_name_formatters(db, app, sips, sip_metadata_types, locations,
                         archive_fs, secure_sipfile_name_formatter,
                         custom_sipmetadata_name_formatter):
    """Test archiving with custom filename formatter.

    Uses a SIP with unsafe file paths (path traversal, URL, non-ASCII) and
    checks that the secure formatter rewrites each data file to a
    '<file-uuid>-<sanitized-name>' form, while the original paths are
    recorded in the 'files/filenames.txt' mapping file.
    """
    sip = sips[3]  # SIP with some naughty filenames
    archiver = BaseArchiver(sip, filenames_mapping_file='files/filenames.txt')
    assert not archive_fs.listdir()
    archiver.write_all_files()
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    assert set(fs.listdir()) == set(['metadata', 'files'])
    assert len(fs.listdir('metadata')) == 2
    # inside 'files/' there should be 'filenames.txt' file with the mappings
    assert len(fs.listdir('files')) == 4
    # Resolve each file's UUID by its extension, since the sanitized archive
    # names are prefixed with the file UUID.
    uuid1 = next(f.file.id for f in sip.files if f.filepath.endswith('txt'))
    uuid2 = next(f.file.id for f in sip.files if f.filepath.endswith('js'))
    uuid3 = next(f.file.id for f in sip.files if f.filepath.endswith('dat'))
    # For 'filenames.txt' the expected content is a set of lines, because
    # the order of mapping entries is not guaranteed.
    expected = [('metadata/marcxml-test-metadata.xml', '<p>XML 4 żółć</p>'),
                ('metadata/json-test-metadata.json',
                 '{"title": "JSON 4 żółć"}'),
                ('files/{0}-foobar.txt'.format(uuid1), 'test-fourth żółć'),
                ('files/{0}-http_maliciouswebsite.com_hack.js'.format(uuid2),
                 'test-fifth ąęćźə'),
                ('files/{0}-ozzcae.dat'.format(uuid3), 'test-sixth π'),
                ('files/filenames.txt',
                 set([
                     '{0}-foobar.txt ../../foobar.txt'.format(uuid1),
                     '{0}-http_maliciouswebsite.com_hack.js '
                     'http://maliciouswebsite.com/hack.js'.format(uuid2),
                     '{0}-ozzcae.dat łóżźćąę.dat'.format(uuid3),
                 ]))]
    for fn, content in expected:
        with fs.open(fn, 'r') as fp:
            if isinstance(content, set):  # Compare as set of lines
                c = set(fp.read().splitlines())
            else:
                c = fp.read()
        assert c == content
def test_name_formatters(db, app, sips, sip_metadata_types, locations,
                         archive_fs, secure_sipfile_name_formatter,
                         custom_sipmetadata_name_formatter):
    """Test archiving with custom filename formatter.

    Uses a SIP with unsafe file paths (path traversal, URL, non-ASCII) and
    checks that the secure formatter rewrites each data file to a
    '<file-uuid>-<sanitized-name>' form, while the original paths are
    recorded in the 'files/filenames.txt' mapping file.
    """
    sip = sips[3]  # SIP with some naughty filenames
    archiver = BaseArchiver(sip, filenames_mapping_file='files/filenames.txt')
    assert not archive_fs.listdir()
    archiver.write_all_files()
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    assert set(fs.listdir()) == set(['metadata', 'files'])
    assert len(fs.listdir('metadata')) == 2
    # inside 'files/' there should be 'filenames.txt' file with the mappings
    assert len(fs.listdir('files')) == 4
    # Resolve each file's UUID by its extension, since the sanitized archive
    # names are prefixed with the file UUID.
    uuid1 = next(f.file.id for f in sip.files if f.filepath.endswith('txt'))
    uuid2 = next(f.file.id for f in sip.files if f.filepath.endswith('js'))
    uuid3 = next(f.file.id for f in sip.files if f.filepath.endswith('dat'))
    # For 'filenames.txt' the expected content is a set of lines, because
    # the order of mapping entries is not guaranteed.
    expected = [
            ('metadata/marcxml-test-metadata.xml', '<p>XML 4 żółć</p>'),
            ('metadata/json-test-metadata.json', '{"title": "JSON 4 żółć"}'),
            ('files/{0}-foobar.txt'.format(uuid1), 'test-fourth żółć'),
            ('files/{0}-http_maliciouswebsite.com_hack.js'.format(uuid2),
             'test-fifth ąęćźə'),
            ('files/{0}-ozzcae.dat'.format(uuid3), 'test-sixth π'),
            ('files/filenames.txt',
             set(['{0}-foobar.txt ../../foobar.txt'.format(uuid1),
                  '{0}-http_maliciouswebsite.com_hack.js '
                  'http://maliciouswebsite.com/hack.js'.format(uuid2),
                  '{0}-ozzcae.dat łóżźćąę.dat'.format(uuid3), ]))

    ]
    for fn, content in expected:
        with fs.open(fn, 'r') as fp:
            if isinstance(content, set):  # Compare as set of lines
                c = set(fp.read().splitlines())
            else:
                c = fp.read()
        assert c == content
def test_getters(db, sips, sip_metadata_types, locations):
    """Test the constructor and the getters."""
    sip = sips[0]
    archiver = BaseArchiver(sip)
    assert archiver.get_archive_base_uri() == locations['archive'].uri
    assert archiver.sip is sip

    # Full paths live under the archive location, chunked by the SIP id
    # as <2 chars>/<2 chars>/<rest>/.
    sid = str(sip.id)
    prefix = "{root}/{c1}/{c2}/{cn}/".format(
        root=locations['archive'].uri, c1=sid[:2], c2=sid[2:4], cn=sid[4:])

    fi = {
        'file_uuid': str(sip.files[0].file_id),
        'filepath': 'files/foobar.txt',
        'filename': 'foobar.txt',
        'sipfilepath': 'foobar.txt',
        'size': 4,
        'fullpath': prefix + 'files/foobar.txt',
        'checksum': 'md5:098f6bcd4621d373cade4e832627b4f6'
    }
    assert archiver._get_data_files() == [fi]

    expected_metadata = [
        {
            'checksum': 'md5:da4ab7e4c4b762d8e2f3ec3b9f801b1f',
            'fullpath': prefix + 'metadata/json-test.json',
            'metadata_id': sip_metadata_types['json-test'].id,
            'filepath': 'metadata/json-test.json',
            'size': 19
        },
        {
            'checksum': 'md5:498d1ce86c2e9b9eb85f1e8105affdf6',
            'fullpath': prefix + 'metadata/marcxml-test.xml',
            'metadata_id': sip_metadata_types['marcxml-test'].id,
            'filepath': 'metadata/marcxml-test.xml',
            'size': 12
        },
    ]
    metafiles_info = archiver._get_metadata_files()
    assert len(metafiles_info) == 2
    for meta in expected_metadata:
        assert meta in metafiles_info

    all_files_info = archiver.get_all_files()
    assert len(all_files_info) == 3
    for info in [fi] + expected_metadata:
        assert info in all_files_info
def test_write(db, sips, sip_metadata_types, locations, archive_fs):
    """Test writing of the SIPFiles and SIPMetadata files to archive."""
    sip = sips[0]
    archiver = BaseArchiver(sip)
    data_files_info = archiver._get_data_files()
    assert not archive_fs.listdir()  # Empty archive
    archiver._write_sipfile(data_files_info[0])
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    assert fs.isfile('files/foobar.txt')

    # Metadata files are only written on demand, one fileinfo at a time.
    assert not fs.isfile('metadata/json-test.json')
    assert not fs.isfile('metadata/marcxml-test.xml')
    metadata_files_info = archiver._get_metadata_files()
    archiver._write_sipmetadata(metadata_files_info[0])
    archiver._write_sipmetadata(metadata_files_info[1])
    assert fs.isfile('metadata/json-test.json')
    assert fs.isfile('metadata/marcxml-test.xml')

    # Extra files can be written from raw content + filename...
    assert not fs.isfile('test.txt')
    archiver._write_extra(content='test raw content', filename='test.txt')
    assert fs.isfile('test.txt')
    with fs.open('test.txt', 'r') as fp:
        cnt = fp.read()
    assert cnt == 'test raw content'

    # ...or from a full fileinfo dictionary.
    assert not fs.isfile('test2.txt')
    extra_file_info = dict(
        checksum=('md5:' + str(md5('test'.encode('utf-8')).hexdigest())),
        size=len('test'),
        filepath='test2.txt',
        fullpath=fs.getsyspath('test2.txt'),
        content='test')
    archiver._write_extra(fileinfo=extra_file_info)
    # BUGFIX: the test previously re-checked 'test.txt' here and never
    # verified that the fileinfo-based write actually produced 'test2.txt'.
    assert fs.isfile('test2.txt')
    with fs.open('test2.txt', 'r') as fp:
        cnt = fp.read()
    assert cnt == 'test'
def test_write(db, sips, sip_metadata_types, locations, archive_fs):
    """Test writing of the SIPFiles and SIPMetadata files to archive."""
    sip = sips[0]
    archiver = BaseArchiver(sip)
    data_files_info = archiver._get_data_files()
    assert not archive_fs.listdir()  # Empty archive
    archiver._write_sipfile(data_files_info[0])
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    assert fs.isfile('files/foobar.txt')

    # Metadata files are only written on demand, one fileinfo at a time.
    assert not fs.isfile('metadata/json-test.json')
    assert not fs.isfile('metadata/marcxml-test.xml')
    metadata_files_info = archiver._get_metadata_files()
    archiver._write_sipmetadata(metadata_files_info[0])
    archiver._write_sipmetadata(metadata_files_info[1])
    assert fs.isfile('metadata/json-test.json')
    assert fs.isfile('metadata/marcxml-test.xml')

    # Extra files can be written from raw content + filename...
    assert not fs.isfile('test.txt')
    archiver._write_extra(content='test raw content', filename='test.txt')
    assert fs.isfile('test.txt')
    with fs.open('test.txt', 'r') as fp:
        cnt = fp.read()
    assert cnt == 'test raw content'

    # ...or from a full fileinfo dictionary.
    assert not fs.isfile('test2.txt')
    extra_file_info = dict(
        checksum=('md5:' + str(md5('test'.encode('utf-8')).hexdigest())),
        size=len('test'),
        filepath='test2.txt',
        fullpath=fs.getsyspath('test2.txt'),
        content='test'
    )
    archiver._write_extra(fileinfo=extra_file_info)
    # BUGFIX: the test previously re-checked 'test.txt' here and never
    # verified that the fileinfo-based write actually produced 'test2.txt'.
    assert fs.isfile('test2.txt')
    with fs.open('test2.txt', 'r') as fp:
        cnt = fp.read()
    assert cnt == 'test'