def test_fileinstance_copy_contents(app, db, dummy_location):
    """Test copy contents."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    # Create source and set data.
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), default_location=dummy_location.uri)
    db.session.commit()

    # Create destination - and use it to copy_contents from another object.
    dst = FileInstance.create()
    assert dst.size == 0
    assert dst.uri is None
    db.session.commit()

    # Copy contents
    dst.copy_contents(
        src, progress_callback=callback, default_location=dummy_location.uri)
    db.session.commit()
    assert dst.size == src.size
    assert dst.checksum == src.checksum
    assert dst.uri != src.uri
    assert counter['called']

    # Read data
    fp = dst.storage().open()
    assert data == fp.read()
    fp.close()
def test_fileinstance_copy_contents(app, db, dummy_location):
    """Test copy contents."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    # Create source and set data.
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Create destination - and use it to copy_contents from another object.
    dst = FileInstance.create()
    assert dst.size == 0
    assert dst.uri is None
    db.session.commit()

    # Copy contents
    dst.copy_contents(src, progress_callback=callback, location=dummy_location)
    db.session.commit()
    assert dst.size == src.size
    assert dst.checksum == src.checksum
    assert dst.uri != src.uri
    assert counter['called']

    # Read data
    fp = dst.storage().open()
    assert data == fp.read()
    fp.close()
def test_object_restore(app, db, dummy_location):
    """Restore object."""
    f1 = FileInstance(uri="f1", size=1, checksum="mychecksum")
    f2 = FileInstance(uri="f2", size=2, checksum="mychecksum2")
    db.session.add(f1)
    db.session.add(f2)
    b1 = Bucket.create()

    obj1 = ObjectVersion.create(b1, "test").set_file(f1)
    ObjectVersion.create(b1, "test").set_file(f2)
    obj_deleted = ObjectVersion.delete(b1, "test")
    db.session.commit()

    assert ObjectVersion.query.count() == 3
    # Cannot restore a deleted version.
    pytest.raises(InvalidOperationError, obj_deleted.restore)

    # Restore first version
    obj_new = obj1.restore()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert obj_new.is_head is True
    assert obj_new.version_id != obj1.version_id
    assert obj_new.key == obj1.key
    assert obj_new.file_id == obj1.file_id
    assert obj_new.bucket == obj1.bucket
def test_fileinstance_get(app, db, dummy_location):
    """Test fileinstance get."""
    f = FileInstance.create()
    db.session.commit()
    # Get existing file.
    assert FileInstance.get(f.id) is not None
    # Non-existing files return None.
    assert FileInstance.get(uuid.uuid4()) is None
def test_fileinstance_get_by_uri(app, db, dummy_location):
    """Test file get by uri."""
    f = FileInstance.create()
    f.uri = "LICENSE"
    db.session.commit()

    assert FileInstance.get_by_uri("LICENSE") is not None
    FileInstance.get_by_uri("NOTVALID") is None
    pytest.raises(AssertionError, FileInstance.get_by_uri, None)
def create_b2safe_file(external_pids, bucket):
    """Create a FileInstance which contains a PID in its uri."""
    validate_schema(
        external_pids, {
            'type': 'array',
            'items': {
                'type': 'object',
                'properties': {
                    'ePIC_PID': {
                        'type': 'string'
                    },
                    'key': {
                        'type': 'string'
                    }
                },
                'additionalProperties': False,
                'required': ['ePIC_PID', 'key']
            }
        })

    keys_list = [e['key'] for e in external_pids]
    keys_set = set(keys_list)
    if len(keys_list) != len(keys_set):
        raise InvalidDepositError([
            FieldError('external_pids',
                       'Field external_pids contains duplicate keys.')
        ])
    for external_pid in external_pids:
        if not external_pid['ePIC_PID'].startswith('http://hdl.handle.net/'):
            external_pid['ePIC_PID'] = 'http://hdl.handle.net/' + \
                external_pid['ePIC_PID']
        if external_pid['key'].startswith('/'):
            raise InvalidDepositError([
                FieldError('external_pids',
                           'File key cannot start with a "/".')
            ])
        try:
            # Create the file instance if it does not already exist
            file_instance = FileInstance.get_by_uri(external_pid['ePIC_PID'])
            if file_instance is None:
                file_instance = FileInstance.create()
                file_instance.set_uri(external_pid['ePIC_PID'],
                                      1,
                                      0,
                                      storage_class='B')
            assert file_instance.storage_class == 'B'
            # Add the file to the bucket if it is not already in it
            current_version = ObjectVersion.get(bucket, external_pid['key'])
            if not current_version or \
                    current_version.file_id != file_instance.id:
                ObjectVersion.create(bucket, external_pid['key'],
                                     file_instance.id)
        except IntegrityError as e:
            raise InvalidDepositError(
                [FieldError('external_pids', 'File URI already exists.')])
    def _get_frames(cls, master_video):
        """Get Frames."""
        return [
            FileInstance.get(f['file_id']).uri
            for f in CDSVideosFilesIterator.get_video_frames(
                master_file=master_video)
        ]
    def upload(self, pid=None, *args, **kwargs):
        """Upload action for file/repository."""
        with UpdateDepositPermission(self).require(403):
            data = request.get_json()

            fileinfo = self._construct_fileinfo(data['url'], data['type'])
            if request:
                _, record = request.view_args.get('pid_value').data
                record_id = str(record.id)
                filename = fileinfo['filename']
                obj = ObjectVersion.create(bucket=record.files.bucket,
                                           key=filename)
                obj.file = FileInstance.create()
                record.files.flush()
                record.files[filename]['source_url'] = data['url']

                if data['type'] == 'url':
                    if data['url'].startswith(
                        ('https://github', 'https://gitlab.cern.ch',
                         'root://')):
                        download_url.delay(record_id, data['url'], fileinfo)
                    else:
                        raise FileUploadError(
                            'Please provide a valid file url.')
                else:
                    if data['url'].startswith(
                        ('https://github', 'https://gitlab.cern.ch')):
                        download_repo.delay(record_id, data['url'], filename)
                    else:
                        raise FileUploadError(
                            'Please provide a valid repository url.')

            return self
def handle_record_files(data, bucket, files, skip_files):
    """Handles record files."""
    for file in files:
        if skip_files:
            break
        assert 'uri' in file
        assert 'size' in file
        assert 'checksum' in file

        try:
            f = FileInstance.create()
            filename = file.get("uri").split('/')[-1:][0]
            f.set_uri(file.get("uri"), file.get("size"), file.get("checksum"))
            obj = ObjectVersion.create(bucket, filename, _file_id=f.id)

            file.update({
                'bucket': str(obj.bucket_id),
                'checksum': obj.file.checksum,
                'key': obj.key,
                'version_id': str(obj.version_id),
            })

        except Exception as e:
            click.echo('Recid {0} file {1} could not be loaded due '
                       'to {2}.'.format(data.get('recid'), filename, str(e)))
            continue
    def save_file(self, content, filename, size, failed=False):
        """Save file with given content in deposit bucket.

           If downloading a content failed, file will be still created,
           with tag `failed`.

           :param content: stream
           :param filename: name that file will be saved with
           :param size: size of content
           :param failed: if failed during downloading the content
        """
        obj = ObjectVersion.create(bucket=self.files.bucket, key=filename)
        obj.file = FileInstance.create()
        self.files.flush()

        if not failed:
            self.files[filename].file.set_contents(
                content,
                default_location=self.files.bucket.location.uri,
                size=size)

            print('File {} saved ({}b).\n'.format(filename, size))
        else:
            ObjectVersionTag.create(object_version=obj,
                                    key='status',
                                    value='failed')
            print('File {} not saved.\n'.format(filename))

        self.files.flush()
        db.session.commit()

        return obj
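
A hedged usage sketch for the method above (the `deposit` object, file names,
and sizes are illustrative assumptions, not part of the original example):

from io import BytesIO

content = BytesIO(b'col_a,col_b\n1,2\n')
# Successful download: the contents are written to the bucket's default
# location and the ObjectVersion is returned.
obj = deposit.save_file(content, 'table.csv', size=16)
# Failed download: the ObjectVersion is still created, but it is tagged
# status=failed and no contents are written.
obj = deposit.save_file(None, 'missing.csv', size=0, failed=True)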
def test_b2share_storage_with_pid(base_app, app, tmp_location, login_user,
                                  test_users):
    """Check that the storage class will redirect pid files."""
    pid = 'http://hdl.handle.net/11304/74c66f0b-f814-4202-9dcb-4889ba9b1047'
    with app.app_context():
        # Disable access control for this test
        tmp_location = Location.query.first()
        with db.session.begin_nested():
            bucket = Bucket.create(tmp_location, storage_class='B')
            pid_file = FileInstance.create()
            pid_file.set_uri(pid, 1, 0, storage_class='B')
            ObjectVersion.create(bucket, 'test.txt', pid_file.id)

        db.session.commit()
        url = url_for('invenio_files_rest.object_api',
                      bucket_id=bucket.id,
                      key='test.txt')
    try:
        with app.app_context():
            permission = current_files_rest.permission_factory
            current_files_rest.permission_factory = allow_all
        # Check that accessing the file redirects to the PID
        with app.test_client() as client:
            resp = client.get(url)
            assert resp.headers['Location'] == pid
            assert resp.status_code == 302
    finally:
        with app.app_context():
            current_files_rest.permission_factory = permission
    def delete_record(self, fileinstance_id, record_uuid):
        """Delete a record.

        :param fileinstance_id: The file instance id.
        :param record_uuid: The record's uuid.
        """
        # get the FileInstance object
        file_instance = FileInstance.get(fileinstance_id)
        # get the uri of the file, from which the folder path is derived
        uri = file_instance.uri
        # locate the index of the "data" folder to build the path to delete
        i = uri.find('data')

        # removing the record indexing, the record and the file instance
        recind = RecordIndexer()
        recind.delete_by_id(record_uuid=record_uuid)
        self.delete_bucket()
        FileInstance.query.filter_by(id=fileinstance_id).delete()
        PersistentIdentifier.query.filter_by(object_uuid=record_uuid).delete()
        db.session.commit()

        # removing the file on disk and the folder containing it
        # the full path has the form /home/<user>/.local/share/virtualenvs/
        # fare-platform-<code>/var/instance/data/<f1>/<f2>/<bucketid>/<filename>
        # where <f1> is a folder name of two characters. Starting from the
        # index of the "data" folder (which holds all the records), the path
        # is trimmed so that it ends at "<f1>/": 8 is added to the index
        # because "data/" is 5 characters and "<f1>/" adds 3 more.
        shutil.rmtree(uri[:i + 8])

        current_app.logger.info("Deleted file= " + self['title'] +
                                ", by user= " + current_user.email)
def test_pyfilesystemstorage(app, db, dummy_location):
    """Test pyfs storage."""
    # Create bucket and object
    with db.session.begin_nested():
        b1 = Bucket.create()
        obj = ObjectVersion.create(b1, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    counter = dict(size=0)

    def callback(total, size):
        counter['size'] = size

    data = b("this is some content")
    stream = BytesIO(data)
    loc, size, checksum = storage.save(stream, progress_callback=callback)

    # Verify checksum, size and location.
    m = hashlib.md5()
    m.update(data)
    assert "md5:{0}".format(m.hexdigest()) == checksum

    assert size == len(data)
    assert loc == join(
        dummy_location.uri,
        str(obj.file.id),
        "data")
def test_pyfilesystemstorage_make_path():
    """Test path for files."""
    fi = FileInstance.create()
    fi.id = uuid.uuid5(uuid.NAMESPACE_DNS, 'Testing-')
    fs = PyFilesystemStorage(fi, base_uri='Base')
    assert 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path()
    assert 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 1)
    assert 'Base/4/5/6/29316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(3, 1)
    assert 'Base/456/29316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 3)

    # If length 0, it should take the default value.
    assert 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 0)

    # If dimensions are 0, it should take the default value.
    assert 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(0, 1)

    # Length of each partition is too long.
    with pytest.raises(AssertionError):
        fs.make_path(1, 50)

    # Number of partitions is too high.
    with pytest.raises(AssertionError):
        fs.make_path(50, 1)

    # Both values produce the exception.
    with pytest.raises(AssertionError):
        fs.make_path(50, 50)
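
The assertions above pin down how the file id is split into directory levels.
A minimal standalone sketch of that partitioning (a hypothetical helper, not
the actual PyFilesystemStorage implementation; it omits the bounds checks
that make_path enforces):

import uuid

def partitioned_path(base, file_id, dimensions=1, length=2):
    """Split the leading characters of a UUID into directory levels."""
    hex_id = str(file_id)
    parts = [hex_id[i * length:(i + 1) * length] for i in range(dimensions)]
    return '/'.join([base] + parts + [hex_id[dimensions * length:]])

fid = uuid.uuid5(uuid.NAMESPACE_DNS, 'Testing-')
assert partitioned_path('Base', fid) == \
    'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2'
assert partitioned_path('Base', fid, dimensions=3, length=1) == \
    'Base/4/5/6/29316-6e69-5006-82ba-1ee2f18df5b2'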
def test_fileinstance_set_contents(app, db, dummy_location):
    """Test file instance create."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    f = FileInstance.create()
    db.session.commit()
    assert f.readable is False
    assert f.writable is True
    data = BytesIO(b("test file instance set contents"))
    f.set_contents(
        data, default_location=dummy_location.uri, progress_callback=callback)
    db.session.commit()
    assert f.readable is True
    assert f.writable is False
    assert counter['called']

    pytest.raises(
        ValueError,
        f.set_contents,
        BytesIO(b("different content")),
        location=dummy_location,
    )
def test_object_relink_all(app, db, dummy_location):
    """Test relinking files."""
    b1 = Bucket.create()
    obj1 = ObjectVersion.create(
        b1, "relink-test", stream=BytesIO(b('relinkthis')))
    ObjectVersion.create(b1, "do-not-touch", stream=BytesIO(b('na')))
    b1.snapshot()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert FileInstance.query.count() == 2

    fnew = FileInstance.create()
    fnew.copy_contents(obj1.file, default_location=b1.location.uri)
    db.session.commit()

    fold = obj1.file

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 2
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 0

    ObjectVersion.relink_all(obj1.file, fnew)
    db.session.commit()

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 0
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 2
def test_object_relink_all(app, db, dummy_location):
    """Test relinking files."""
    b1 = Bucket.create()
    obj1 = ObjectVersion.create(
        b1, "relink-test", stream=BytesIO(b('relinkthis')))
    ObjectVersion.create(b1, "do-not-touch", stream=BytesIO(b('na')))
    b1.snapshot()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert FileInstance.query.count() == 2

    fnew = FileInstance.create()
    fnew.copy_contents(obj1.file, location=b1.location)
    db.session.commit()

    fold = obj1.file

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 2
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 0

    ObjectVersion.relink_all(obj1.file, fnew)
    db.session.commit()

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 0
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 2
def datasets(skip_files):
    """Load demo datasets records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from cernopendata.modules.records.minters.datasetid import \
        cernopendata_datasetid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/datasets-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/datasets')
    datasets_json = glob.glob(os.path.join(data, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for filename in datasets_json:

        click.echo('Loading datasets from {0} ...'.format(filename))

        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                # (TOFIX) Remove if statement in production
                # as every dataset record should have a doi
                if data.get('doi', None):
                    cernopendata_datasetid_minter(id, data)
                else:
                    cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))

                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def test_fileinstance_create(app, db, dummy_location):
    """Test file instance create."""
    f = FileInstance.create()
    assert f.id
    assert f.readable is False
    assert f.writable is True
    assert f.uri is None
    assert f.size == 0
    assert f.checksum is None
    assert f.last_check_at is None
    assert f.last_check is None
    db.session.commit()

    # Check unique constraint on URI with none values.
    f = FileInstance.create()
    f = FileInstance.create()
    db.session.commit()
def data_policies(skip_files):
    """Load demo Data Policy records."""
    from invenio_db import db
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_records_files.api import Record

    from invenio_records.models import RecordMetadata

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/data-policies-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data')
    data_policies_json = glob.glob(os.path.join(data, '*.json'))

    for filename in data_policies_json:

        click.echo('Loading data-policies from {0} ...'.format(filename))

        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get(
                        "size"), file.get("checksum"))
                    ObjectVersion.create(
                        bucket,
                        filename,
                        _file_id=f.id
                    )
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def test_storage_interface():
    """Test storage interface."""
    f = FileInstance.create()
    s = Storage(f)

    pytest.raises(NotImplementedError, s.open)
    pytest.raises(NotImplementedError, s.send_file)
    pytest.raises(NotImplementedError, s.save, None)
    pytest.raises(NotImplementedError, s.compute_checksum, None)
def test_object_set_file(app, db, dummy_location):
    """Test object set file."""
    b = Bucket.create()
    f = FileInstance(uri="f1", size=1, checksum="mychecksum")
    obj = ObjectVersion.create(b, "test").set_file(f)
    db.session.commit()
    assert obj.file == f

    assert pytest.raises(FileInstanceAlreadySetError, obj.set_file, f)
def test_pyfs_send_file_fail(app, db, dummy_location):
    """Test send file."""
    f = FileInstance.create()
    f.set_contents(BytesIO(b("test")), location=dummy_location)

    with patch('invenio_files_rest.storage.send_stream') as send_stream:
        send_stream.side_effect = OSError(errno.EPERM, "Permission problem")
        with app.test_request_context():
            pytest.raises(StorageError, f.send_file)
def test_sip_file_model(db):
    """Test the SIPFile model."""
    sip1 = SIP.create('json', '{}')
    file1 = FileInstance.create()
    sipfile1 = SIPFile(sip_id=sip1.id, filepath="foobar.zip", file_id=file1.id)

    db.session.add(sipfile1)
    db.session.commit()
    assert SIP.query.count() == 1
    assert SIPFile.query.count() == 1
def test_publish_process_files(api_app, db, location):
    """Test _process_files changing master tags on bucket snapshots."""
    deposit = CDSDeposit.create(
        dict(
            date='1/2/3',
            category='cat',
            type='type',
            title=dict(title='title'),
            report_number=['1234'],
            videos=[]),
        bucket_location='videos')

    # deposit has no files, so _process_files must yield None
    with deposit._process_files(None, dict()) as data:
        assert data is None
    bucket = deposit.files.bucket
    master_obj = ObjectVersion.create(
        bucket=bucket,
        key='master',
        _file_id=FileInstance.create())
    number_of_slaves = 10
    for i in range(number_of_slaves):
        slave_obj = ObjectVersion.create(
            bucket=bucket,
            key='slave{}.mp4'.format(i + 1),
            _file_id=FileInstance.create())
        ObjectVersionTag.create(slave_obj, 'master', master_obj.version_id)
        ObjectVersionTag.create(slave_obj, 'media_type', 'video')
        ObjectVersionTag.create(slave_obj, 'context_type', 'subformat')
    assert Bucket.query.count() == 1
    with deposit._process_files(None, dict()):
        # the snapshot bucket must have been created
        assert Bucket.query.count() == 2
        for bucket in Bucket.query.all():
            master_version = [str(obj.version_id) for obj in bucket.objects
                              if 'master' not in obj.get_tags()][0]
            # the master of each slave must be in the same bucket
            for obj in bucket.objects:
                if str(obj.version_id) != master_version:
                    assert obj.get_tags()['master'] == master_version
                    assert obj.get_tags()['media_type'] == 'video'
                    assert obj.get_tags()['context_type'] == 'subformat'
def test_fileinstance_copy_contents_invalid(app, db, dummy_location):
    """Test invalid copy contents."""
    # Source not readable
    src = FileInstance.create()
    dst = FileInstance.create()
    pytest.raises(ValueError, dst.copy_contents, src)

    # Create valid source
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Destination not writable
    dst.writable = False
    pytest.raises(ValueError, dst.copy_contents, src)
    # Size is not 0
    dst.writable = True
    dst.size = 1
    pytest.raises(ValueError, dst.copy_contents, src)
def test_fileinstance_copy_contents_invalid(app, db, dummy_location):
    """Test invalid copy contents."""
    # Source not readable
    src = FileInstance.create()
    dst = FileInstance.create()
    pytest.raises(ValueError, dst.copy_contents, src)

    # Create valid source
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), default_location=dummy_location.uri)
    db.session.commit()

    # Destination not writable
    dst.writable = False
    pytest.raises(ValueError, dst.copy_contents, src)
    # Size is not 0
    dst.writable = True
    dst.size = 1
    pytest.raises(ValueError, dst.copy_contents, src)
def test_pyfilesystemstorage_checksum_fail(app, db, dummy_location):
    """Test fixity problems."""
    # Raise an error during checksum calculation
    def callback(total, size):
        raise OSError(errno.EPERM, "Permission")

    f = FileInstance.create()
    f.set_contents(BytesIO(b("test")), location=dummy_location)

    pytest.raises(
        StorageError, PyFilesystemStorage(f).compute_checksum,
        progress_callback=callback)
    def _verify_file_and_symlink(record, file_id, filename):
        # verify uploaded file exists
        uploaded_file = FileInstance.get(file_id)
        uploaded_file_path = uploaded_file.uri
        assert os.path.exists(uploaded_file_path)

        # verify symlink exists
        symlink_path = _get_symlink_path(record, filename)
        assert os.path.exists(symlink_path)

        # verify symlink points to the correct file
        assert os.path.realpath(symlink_path) == os.path.realpath(
            uploaded_file_path)
def test_object_version_tags(app, db, dummy_location):
    """Test object version tags."""
    f = FileInstance(uri="f1", size=1, checksum="mychecksum")
    db.session.add(f)
    db.session.commit()
    b = Bucket.create()
    obj1 = ObjectVersion.create(b, "test").set_file(f)
    ObjectVersionTag.create(obj1, "mykey", "testvalue")
    ObjectVersionTag.create(obj1, "another_key", "another value")
    db.session.commit()

    # Duplicate key
    pytest.raises(
        IntegrityError, ObjectVersionTag.create, obj1, "mykey", "newvalue")

    # Test get
    assert ObjectVersionTag.query.count() == 2
    assert ObjectVersionTag.get(obj1, "mykey").value == "testvalue"
    assert ObjectVersionTag.get_value(obj1.version_id, "another_key") \
        == "another value"
    assert ObjectVersionTag.get_value(obj1, "invalid") is None

    # Test delete
    ObjectVersionTag.delete(obj1, "mykey")
    assert ObjectVersionTag.query.count() == 1
    ObjectVersionTag.delete(obj1, "invalid")
    assert ObjectVersionTag.query.count() == 1

    # Create or update
    ObjectVersionTag.create_or_update(obj1, "another_key", "newval")
    ObjectVersionTag.create_or_update(obj1.version_id, "newkey", "testval")
    db.session.commit()
    assert ObjectVersionTag.get_value(obj1, "another_key") == "newval"
    assert ObjectVersionTag.get_value(obj1, "newkey") == "testval"

    # Get tags as dictionary
    assert obj1.get_tags() == dict(another_key="newval", newkey="testval")
    obj2 = ObjectVersion.create(b, 'test2')
    assert obj2.get_tags() == dict()

    # Copy object version
    obj_copy = obj1.copy()
    db.session.commit()
    assert obj_copy.get_tags() == dict(another_key="newval", newkey="testval")
    assert ObjectVersionTag.query.count() == 4

    # Cascade delete
    ObjectVersion.query.delete()
    db.session.commit()
    assert ObjectVersionTag.query.count() == 0
    def create_file(self, bucket, f):
        """Create a single file."""
        # Ensure that the file instance gets created with the same ID as is
        # used in the REST API.
        fileinstance = FileInstance(
            id=f['id'],
            writable=True,
            readable=False,
            size=0,
        )
        db.session.add(fileinstance)
        fileinstance.set_uri(f['uri'], f['size'], f['checksum'])

        obj = ObjectVersion.create(bucket, f['key']).set_file(fileinstance)

        return (dict(
            bucket=str(obj.bucket.id),
            key=obj.key,
            checksum=obj.file.checksum,
            size=obj.file.size,
            version_id=str(obj.version_id),
            type=f['type'],
        ), fileinstance)
def test_fileinstance_send_file(app, db, dummy_location):
    """Test file instance send file."""
    f = FileInstance.create()
    # File not readable
    pytest.raises(FileInstanceUnreadableError, f.send_file)

    # Write data
    data = b("test file instance set contents")
    f.set_contents(BytesIO(data), default_location=dummy_location.uri)
    db.session.commit()

    # Send data
    with app.test_request_context():
        res = f.send_file('test.txt')
        assert int(res.headers['Content-Length']) == len(data)
def test_fileinstance_send_file(app, db, dummy_location):
    """Test file instance send file."""
    f = FileInstance.create()
    # File not readable
    pytest.raises(ValueError, f.send_file)

    # Write data
    data = b("test file instance set contents")
    f.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Send data
    with app.test_request_context():
        res = f.send_file()
        assert int(res.headers['Content-Length']) == len(data)
def software(skip_files):
    """Load demo software records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.softid import \
        cernopendata_softid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/software-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/software')
    software_json = glob.glob(os.path.join(data, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for filename in software_json:
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                cernopendata_softid_minter(id, data)
                record = Record.create(data, id_=id)
                record['$schema'] = schema
                bucket = Bucket.create()
                RecordsBuckets.create(record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
    def create_file(self, bucket, key, file_versions):
        """Create a single file with all versions."""
        objs = []
        for file_ver in file_versions:
            f = FileInstance.create().set_uri(
                file_ver['full_path'],
                file_ver['size'],
                'md5:{0}'.format(file_ver['checksum']),
            )
            obj = ObjectVersion.create(bucket, key).set_file(f)
            obj.created = arrow.get(
                file_ver['creation_date']).datetime.replace(tzinfo=None)
            objs.append(obj)

        # Set head version
        db.session.commit()
        return objs[-1]
def loaddemofiles(source, force=False):
    """Load demo files."""
    s = stat(source)

    with open(source, 'rb') as fp:
        m = hashlib.md5()
        m.update(fp.read())
        checksum = "md5:{0}".format(m.hexdigest())

    # Create a file instance
    with db.session.begin_nested():
        f = FileInstance.create()
        f.set_uri(source, s.st_size, checksum)

    # Replace all objects associated files.
    ObjectVersion.query.update({ObjectVersion.file_id: str(f.id)})
    db.session.commit()
def delete_file_instance(obj: ObjectVersion):
    """Delete file on filesystem and mark as not readable."""
    current_app.logger.debug(f"Delete file instance: {str(obj)}")

    if obj.file_id:
        f = FileInstance.get(str(obj.file_id))  # type: FileInstance

        is_readable = f.readable

        # Mark file not readable
        f.readable = False

        # Remove the file on disk
        if is_readable:
            f.storage().delete()

    db.session.commit()
def test_transfer_rsync(app, db, location):
    """Test factories.transfer_rsync function."""
    # config
    app.config['SIPSTORE_ARCHIVER_DIRECTORY_BUILDER'] = \
        'helpers:archive_directory_builder'
    app.config['SIPSTORE_ARCHIVER_METADATA_TYPES'] = ['test']
    # SIP
    sip = SIP.create()
    # SIPMetadataType
    mtype = SIPMetadataType(title='Test', name='test', format='json')
    db.session.add(mtype)
    # SIPMetadata
    mcontent = {'title': 'title', 'author': 'me'}
    meth = SIPMetadata(sip=sip, type=mtype, content=json.dumps(mcontent))
    db.session.add(meth)
    # SIPFile
    f = FileInstance.create()
    fcontent = b'weighted companion cube\n'
    f.set_contents(BytesIO(fcontent), default_location=location.uri)
    sfile = SIPFile(sip=sip, file=f, filepath='portal.txt')
    db.session.add(sfile)
    db.session.commit()

    # EXPORT
    folder = path.join(location.uri, 'lulz')
    params = {
        'server': '',
        'user': '',
        'destination': folder,
        'args': '-az'
    }
    factories.transfer_rsync(sip.id, params)

    # TEST
    assert not path.exists(path.join(location.uri, 'test'))
    assert path.isdir(folder)
    assert path.isdir(path.join(folder, 'files'))
    assert path.isfile(path.join(folder, 'files', 'portal.txt'))
    assert path.isdir(path.join(folder, 'metadata'))
    assert path.isfile(path.join(folder, 'metadata', 'test.json'))
    with open(path.join(folder, 'files', 'portal.txt'), 'rb') as fp:
        assert fp.read() == fcontent
    with open(path.join(folder, 'metadata', 'test.json'), 'r') as fp:
        assert json.loads(fp.read()) == mcontent
def process_x_cap_files(record, x_cap_files):
    """Process files, update record."""
    result = []
    old_keys = set(record.files.keys)
    used_keys = set()

    # Download new files.
    urls = {
        error.url
        for error in x_cap_files if error.condition and error.url
    }
    for url in urls:
        if url not in record.files:
            result.append(url)

            obj = ObjectVersion.create(bucket=record.files.bucket, key=url)
            obj.file = FileInstance.create()
            record.files.flush()

            record.files[url]['source_url'] = url

    # Update file key for external URLs.
    for error in x_cap_files:
        if error.url:
            error.update_file_key(error.url)

    # Calculate references.
    keyfunc = attrgetter('file_key')
    for key, errors in groupby(sorted(x_cap_files, key=keyfunc), keyfunc):
        if key is None:
            continue

        refs = extract_refs_from_errors(errors)
        if refs:
            used_keys.add(key)
            record.files[key]['refs'] = refs

    for key in old_keys - used_keys:
        record.files[key]['refs'] = []

    return result
def test_pyfilesystemstorage(app, db, dummy_location):
    """Test pyfs storage."""
    # Create bucket and object
    with db.session.begin_nested():
        b1 = Bucket.create()
        obj = ObjectVersion.create(b1, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    counter = dict(size=0)

    def callback(total, size):
        counter['size'] = size

    def test_file_save(data, **kwargs):
        stream = BytesIO(data)
        loc, size, checksum = storage.save(stream, progress_callback=callback,
                                           **kwargs)

        # Verify checksum, size and location.
        m = hashlib.md5()
        m.update(data)
        assert "md5:{0}".format(m.hexdigest()) == checksum

        assert size == len(data)
        assert loc == join(dummy_location.uri, str(obj.file.id)[0:2],
                           str(obj.file.id)[2:], 'data')

    data = b("this is some content")
    # test without size
    test_file_save(data)
    # test with correct size
    test_file_save(data, size=len(data))
    # test with wrong sizes
    with pytest.raises(UnexpectedFileSizeError):
        test_file_save(data, size=len(data) - 1)
    with pytest.raises(UnexpectedFileSizeError):
        test_file_save(data, size=len(data) + 1)
def upload_to_zenodo(bucket_id, filename):
    """Upload code to zenodo."""
    zenodo_server_url = current_app.config.get('ZENODO_SERVER_URL')
    params = {"access_token": current_app.config.get(
        'ZENODO_ACCESS_TOKEN')}
    filename = filename + '.tar.gz'

    r = requests.post(zenodo_server_url,
                      params=params, json={},
                      )

    file_obj = ObjectVersion.get(bucket_id, filename)
    file = FileInstance.get(file_obj.file_id)

    bucket_url = r.json()['links']['bucket']
    with open(file.uri, 'rb') as fp:
        response = requests.put(
            bucket_url + '/{}'.format(filename),
            data=fp,
            params=params,
        )

    return jsonify({"status": response.status_code})
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs."""
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db
    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])
    recid = None

    if sips:
        recids = [int(sip['metadata']['recid']) for sip in sips]
        if len(set(recids)) > 1:
            logger.error('Multiple recids ({recids}) found in deposit '
                         '{depid}.'.format(recids=recids,
                                           depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value, list(set(recids)))
        elif recids:  # If only one recid
            recid = recids[0]

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()

    for file_ in files:
        fi = FileInstance.create()
        fi.set_uri(file_['path'], file_['size'], file_['checksum'])
        ov = ObjectVersion.create(buc, file_['name'], _file_id=fi.id)
        file_meta = dict(
            bucket=str(buc.id),
            key=file_['name'],
            checksum=file_['checksum'],
            size=file_['size'],
            version_id=str(ov.version_id),
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    for idx, sip in enumerate(sips):
        agent = None
        user_id = None
        if sip['agents']:
            agent = dict(
                ip_address=sip['agents'][0].get('ip_address', ""),
                email=sip['agents'][0].get('email_address', ""),
            )
            user_id = sip['agents'][0]['user_id']
        content = sip['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format,
                             content,
                             user_id=user_id,
                             agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exist.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            raise DepositSIPUserDoesNotExist(dep_pid.pid_value, user_id)

        # If recid was found, attach it to SIP
        # TODO: This always uses the first recid, as we quit if multiple
        # recids are found in the sips information
        if recid:
            try:
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=recid)
                record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
                db.session.add(record_sip)
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'Deposit {depid} does not exist.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                raise DepositRecidDoesNotExist(dep_pid.pid_value, recid)
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    db.session.commit()
    return deposit
def sips(db, locations, sip_metadata_types):
    """Fixture for the SIP objects sharing multiple files.

    Four SIPs are sharing three files in the following way:
    SIP-1: File1
    SIP-2: File1, File2
    SIP-3: File2(renamed on SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6
    """
    # A SIP with agent info
    sip1 = SIP.create(agent={
        'email': '*****@*****.**',
        'orcid': '1111-1111-1111-1111',
        'ip_address': '1.1.1.1'
    })
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached should not be archived
    # (see conftest configuration)
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = FileInstance.create()
    file1.set_contents(BytesIO(b('test')),
                       default_location=locations['default'].uri)
    sip1file1 = SIPFile(sip_id=sip1.id, filepath="foobar.txt",
                        file_id=file1.id)

    db_.session.add(sip1file1)

    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = FileInstance.create()
    file2.set_contents(BytesIO(b'test-second'),
                       default_location=locations['default'].uri)
    sip2file1 = SIPFile(sip_id=sip2.id, filepath="foobar.txt",
                        file_id=file1.id)
    sip2file2 = SIPFile(sip_id=sip2.id, filepath="foobar2.txt",
                        file_id=file2.id)

    db_.session.add(sip2file1)
    db_.session.add(sip2file2)

    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = FileInstance.create()
    file3.set_contents(BytesIO(b'test-third'),
                       default_location=locations['default'].uri)
    sip3file2 = SIPFile(sip_id=sip3.id, filepath="foobar2-renamed.txt",
                        file_id=file2.id)
    sip3file3 = SIPFile(sip_id=sip3.id, filepath="foobar3.txt",
                        file_id=file3.id)

    db_.session.add(sip3file2)
    db_.session.add(sip3file3)

    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = FileInstance.create()
    file4.set_contents(BytesIO('test-fourth żółć'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file5 = FileInstance.create()
    file5.set_contents(BytesIO('test-fifth ąęćźə'.encode('utf-8')),
                       default_location=locations['default'].uri)

    file6 = FileInstance.create()
    file6.set_contents(BytesIO('test-sixth π'.encode('utf-8')),
                       default_location=locations['default'].uri)
    sip5file4 = SIPFile(sip_id=sip4.id, filepath="../../foobar.txt",
                        file_id=file4.id)

    sip5file5 = SIPFile(sip_id=sip4.id,
                        filepath="http://maliciouswebsite.com/hack.js",
                        file_id=file5.id)

    sip5file6 = SIPFile(sip_id=sip4.id,
                        filepath="łóżźćąę.dat",
                        file_id=file6.id)

    db_.session.add(sip5file4)
    db_.session.add(sip5file5)
    db_.session.add(sip5file6)

    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')

    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]
def test_fileinstance_validation(app, db, dummy_location):
    """Test validating the FileInstance."""
    f = FileInstance.create()
    f.set_uri('x' * 255, 1000, 1000)  # Should not raise
    pytest.raises(ValueError, f.set_uri, 'x' * 256, 1000, 1000)