def create_sips(cls, dump, deposit, files, recid):
    """Create the submission information packages (SIPs) of a dump.

    Nothing is created when ``recid`` is missing or its status is
    ``PIDStatus.RESERVED``. The ``SIPFile`` entries are attached to the
    first package only, while every package is linked to ``recid``.
    """
    if not recid or recid.status == PIDStatus.RESERVED:
        return

    for position, sip_dump in enumerate(dump.sips):
        # Build the package itself from the dumped values.
        package = SIP.create(
            sip_dump['format'],
            sip_dump['content'],
            user_id=sip_dump['user_id'],
            agent=sip_dump['agent'],
            id_=sip_dump['id'],
        )
        package.created = sip_dump['timestamp']

        # Only the very first package carries the SIP files.
        if position == 0:
            for file_meta, file_instance in files:
                sip_file = SIPFile(
                    sip_id=package.id,
                    filepath=file_meta['key'],
                    file_id=file_instance.id,
                )
                db.session.add(sip_file)

        # Relate the package to the record PID.
        relation = RecordSIP(sip_id=package.id, pid_id=recid.id)
        db.session.add(relation)
def reconstruct_sipfiles_t(recid=None, pid=None):
    """Reconstruct SIPFiles from record metadata.

    The oldest ``RecordSIP`` of the PID is looked up, and the ``_files``
    entry of the first record revision that has one is used to create the
    missing ``SIPFile`` rows of that SIP. Rows that already exist (same
    SIP, file path and file id) are left untouched.

    :param recid: Value of the 'recid' PID; only used when ``pid`` is not
        given.
    :param pid: Persistent identifier of the record. Takes precedence over
        ``recid``.
    :raises Exception: If the PID has no ``RecordSIP``, if no record
        revision contains ``_files``, or if a listed file version has no
        matching ``ObjectVersion``.
    """
    if not pid:
        pid = PersistentIdentifier.get('recid', recid)
    recsip = RecordSIP.query.filter_by(pid_id=pid.id).order_by(
        RecordSIP.created).first()
    if recsip is None:
        raise Exception("RecordSIP does not exist.")
    sip = recsip.sip
    record = Record.get_record(recsip.pid.object_uuid)
    first_json_rec = next(
        (rec for rec in record.revisions if '_files' in rec), None)
    if first_json_rec is None:
        raise Exception("Files information not found in SIPMetadata nor"
                        " in Record revision")
    files_j = first_json_rec['_files']

    # Resolve and validate every object version before touching the
    # session: ``first()`` yields None for an unknown version id, which
    # would otherwise surface later as an opaque AttributeError after some
    # SIPFiles had already been added.
    ovs = []
    for fj in files_j:
        ov = ObjectVersion.query.filter_by(
            version_id=fj['version_id']).first()
        if ov is None:
            raise Exception(
                "ObjectVersion {0} does not exist.".format(fj['version_id']))
        ovs.append(ov)

    for ov in ovs:
        q = SIPFile.query.filter_by(sip_id=sip.id, filepath=ov.key,
                                    file_id=ov.file_id)
        if not q.count():
            obj = SIPFile(sip_id=sip.id, filepath=ov.key, file_id=ov.file_id,
                          created=sip.created)
            db.session.add(obj)
    db.session.commit()
def test_sip_file_model(app, db, sips):
    """Check that SIPFile refuses a filepath that is too long."""
    first_sip = sips[0]
    # Lower the limit so that the file name below exceeds it.
    app.config['SIPSTORE_FILEPATH_MAX_LEN'] = 15

    with pytest.raises(ValueError) as excinfo:
        SIPFile(
            sip_id=first_sip.id,
            filepath="way too long file name.zip",
            file_id=first_sip.files[0].file_id,
        )

    error_message = str(excinfo.value)
    assert 'Filepath too long' in error_message
def test_sip_file_model(db):
    """Persist a single SIPFile and check the resulting row counts."""
    package = SIP.create('json', '{}')
    instance = FileInstance.create()

    db.session.add(
        SIPFile(sip_id=package.id, filepath="foobar.zip",
                file_id=instance.id))
    db.session.commit()

    # Exactly one package and one file entry must have been stored.
    assert SIP.query.count() == 1
    assert SIPFile.query.count() == 1
def attach_file(self, file):
    """Attach a file to the SIP.

    The new ``SIPFile`` is added to the database session; no commit is
    issued here.

    :param file: the file to attach. Its ``key`` and ``file_id``
        attributes are read. See
        :py:class:`invenio_files_rest.models.ObjectVersion`.
    :returns: the created SIPFile
    :rtype: :py:class:`invenio_sipstore.models.SIPFile`
    """
    sip_file = SIPFile(
        sip_id=self.id,
        filepath=file.key,
        file_id=file.file_id,
    )
    db.session.add(sip_file)
    return sip_file
def load_sipfile(file_id, filepath, sip_id, created):
    """Store one SIPFile built from plain values, unless it already exists.

    ``created`` is parsed with ``arrow`` and stored without timezone
    information.
    """
    identity = dict(sip_id=sip_id, filepath=filepath, file_id=file_id)
    matches = SIPFile.query.filter_by(**identity)
    # Parsed before the existence check, so a malformed value fails either
    # way.
    naive_created = arrow.get(created).datetime.replace(tzinfo=None)

    if matches.count() == 0:
        db.session.add(SIPFile(created=naive_created, **identity))
    db.session.commit()
def test_transfer_rsync(app, db, location):
    """Export a SIP with factories.transfer_rsync and inspect the result."""
    # Configuration: custom directory builder, only 'test' metadata.
    app.config['SIPSTORE_ARCHIVER_DIRECTORY_BUILDER'] = \
        'helpers:archive_directory_builder'
    app.config['SIPSTORE_ARCHIVER_METADATA_TYPES'] = ['test']

    # One SIP with a single metadata entry...
    package = SIP.create()
    metadata_type = SIPMetadataType(title='Test', name='test', format='json')
    db.session.add(metadata_type)
    expected_metadata = {'title': 'title', 'author': 'me'}
    metadata = SIPMetadata(sip=package, type=metadata_type,
                           content=json.dumps(expected_metadata))
    db.session.add(metadata)

    # ...and a single file.
    instance = FileInstance.create()
    expected_bytes = b'weighted companion cube\n'
    instance.set_contents(BytesIO(expected_bytes),
                          default_location=location.uri)
    db.session.add(SIPFile(sip=package, file=instance, filepath='portal.txt'))
    db.session.commit()

    # Run the export into a sub-folder of the location.
    target = path.join(location.uri, 'lulz')
    factories.transfer_rsync(
        package.id,
        dict(server='', user='', destination=target, args='-az'),
    )

    # Check the exported tree and its content.
    exported_file = path.join(target, 'files', 'portal.txt')
    exported_metadata = path.join(target, 'metadata', 'test.json')
    assert not path.exists(path.join(location.uri, 'test'))
    assert path.isdir(target)
    assert path.isdir(path.join(target, 'files'))
    assert path.isfile(exported_file)
    assert path.isdir(path.join(target, 'metadata'))
    assert path.isfile(exported_metadata)
    with open(exported_file, 'rb') as stream:
        assert stream.read() == expected_bytes
    with open(exported_metadata, 'r') as stream:
        assert json.load(stream) == expected_metadata
def create(cls, pid, record, create_sip_files=True, user_id=None,
           agent=None):
    """Build a Zenodo SIP out of a PID and its Record.

    Besides the SIP, a ``RecordSIP`` linking the SIP to the PID is added
    and, optionally, one ``SIPFile`` per file of the record. Only the SIP
    is returned; everything is created inside a nested session
    transaction.

    :param pid: PID of the published record ('recid').
    :type pid: `invenio_pidstore.models.PersistentIdentifier`
    :param record: Record for which the SIP should be created.
    :type record: `invenio_records.api.Record`
    :param create_sip_files: If True the SIPFiles will be created.
    :type create_sip_files: bool
    :param user_id: Id of the submitting user. When not given, the id of
        the current user is used (None for an anonymous user).
    :param agent: Agent information stored on the SIP. When not given, it
        is obtained from ``cls._build_agent_info()``.
    :returns: The created SIP object.
    :rtype: ``invenio_sipstore.models.SIP``
    """
    if not user_id:
        if current_user.is_anonymous:
            user_id = None
        else:
            user_id = current_user.get_id()
    if not agent:
        agent = cls._build_agent_info()

    with db.session.begin_nested():
        content = json.dumps(record.dumps())
        sip = SIP.create('json', content, user_id=user_id, agent=agent)
        db.session.add(RecordSIP(sip_id=sip.id, pid_id=pid.id))

        if record.files and create_sip_files:
            for record_file in record.files:
                db.session.add(
                    SIPFile(sip_id=sip.id, filepath=record_file.key,
                            file_id=record_file.file_id))
    return sip
def sips(db, locations, sip_metadata_types):
    """Fixture providing SIP API objects, several of which share files.

    Five SIPs are built on top of six files:
        SIP-1: File1 (this SIP also carries agent information)
        SIP-2: File1, File2
        SIP-3: File2 (renamed on the SIPFile, same FileInstance), File3
        SIP-4: File4, File5, File6 (stored under awkward file paths)
        SIP-5: no files, metadata only
    """
    def new_sip(metadata, **create_kwargs):
        """Create a SIP, wrap it in SIPApi and attach ``metadata`` to it."""
        model = SIP.create(**create_kwargs)
        api = SIPApi(model)
        for type_name, content in metadata:
            api.attach_metadata(type_name, content)
        return model, api

    def new_file(payload):
        """Create a FileInstance holding ``payload`` in the default location."""
        instance = FileInstance.create()
        instance.set_contents(BytesIO(payload),
                              default_location=locations['default'].uri)
        return instance

    def link(model, entries):
        """Add one SIPFile per ``(filepath, FileInstance)`` pair to the session."""
        rows = [
            SIPFile(sip_id=model.id, filepath=filepath, file_id=instance.id)
            for filepath, instance in entries
        ]
        for row in rows:
            db_.session.add(row)

    # A SIP with agent info. Metadata 'txt-test', although attached, should
    # not be archived (see conftest configuration).
    sip1, sip1api = new_sip(
        [
            ('marcxml-test', '<p>XML 1</p>'),
            ('json-test', '{"title": "JSON 1"}'),
            ('txt-test', 'Title: TXT 1'),
        ],
        agent={
            'email': '*****@*****.**',
            'orcid': '1111-1111-1111-1111',
            'ip_address': '1.1.1.1'
        },
    )
    file1 = new_file(b('test'))
    link(sip1, [("foobar.txt", file1)])

    # Second SIP: reuses File1 and introduces File2.
    sip2, sip2api = new_sip([
        ('marcxml-test', '<p>XML 2</p>'),
        ('json-test', '{"title": "JSON 2"}'),
    ])
    file2 = new_file(b'test-second')
    link(sip2, [("foobar.txt", file1), ("foobar2.txt", file2)])

    # Third SIP: File2 under a new name, plus File3.
    sip3, sip3api = new_sip([
        ('marcxml-test', '<p>XML 3</p>'),
        ('json-test', '{"title": "JSON 3"}'),
    ])
    file3 = new_file(b'test-third')
    link(sip3, [("foobar2-renamed.txt", file2), ("foobar3.txt", file3)])

    # A SIP with naughty filenames
    sip4, sip4api = new_sip([
        ('marcxml-test', '<p>XML 4 żółć</p>'),
        ('json-test', '{"title": "JSON 4 żółć"}'),
    ])
    file4 = new_file('test-fourth żółć'.encode('utf-8'))
    file5 = new_file('test-fifth ąęćźə'.encode('utf-8'))
    file6 = new_file('test-sixth π'.encode('utf-8'))
    link(sip4, [
        ("../../foobar.txt", file4),
        ("http://maliciouswebsite.com/hack.js", file5),
        ("łóżźćąę.dat", file6),
    ])

    # A SIP with metadata-only changes
    _, sip5api = new_sip([('marcxml-test', '<p>XML 5 Meta Only</p>')])

    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs.

    A new bucket is created and linked to the deposit. For every entry of
    ``deposit['files']`` a ``FileInstance`` and an ``ObjectVersion`` are
    created and described in ``deposit['_files']``. For every entry of
    ``deposit['sips']`` a SIP is created and, when the SIPs refer to a
    recid, linked to that record PID. The ``SIPFile`` objects are attached
    to the first SIP only.

    :param deposit: Deposit being loaded. It is read as a mapping
        (``files``, ``sips``), updated in place (``_deposit``, ``_files``)
        and committed.
    :param dep_pid: PID of the deposit; its ``pid_value`` is used in log
        messages and raised errors.
    :returns: The committed deposit.
    :raises DepositMultipleRecids: If the SIPs refer to more than one
        recid.
    :raises DepositSIPUserDoesNotExist: If a SIP refers to an unknown user.
    :raises DepositRecidDoesNotExist: If the recid PID does not exist.
    """
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db

    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])

    # All SIPs of one deposit must refer to the same recid.
    recid = None
    if sips:
        recids = [int(sip_data['metadata']['recid']) for sip_data in sips]
        unique_recids = set(recids)
        if len(unique_recids) > 1:
            logger.error('Multiple recids ({recids}) found in deposit '
                         '{depid}.'.format(recids=recids,
                                           depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value,
                                        list(unique_recids))
        elif recids:  # If only one recid
            recid = recids[0]

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()
    for file_ in files:
        fi = FileInstance.create()
        fi.set_uri(file_['path'], file_['size'], file_['checksum'])
        ov = ObjectVersion.create(buc, file_['name'], _file_id=fi.id)
        file_meta = dict(
            bucket=str(buc.id),
            key=file_['name'],
            checksum=file_['checksum'],
            size=file_['size'],
            version_id=str(ov.version_id),
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    for idx, sip_data in enumerate(sips):
        agent = None
        user_id = None
        if sip_data['agents']:
            # Only the first agent is taken into account.
            first_agent = sip_data['agents'][0]
            agent = dict(
                ip_address=first_agent.get('ip_address', ""),
                email=first_agent.get('email_address', ""),
            )
            user_id = first_agent['user_id']
        content = sip_data['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format, content, user_id=user_id,
                             agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exists.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            raise DepositSIPUserDoesNotExist(dep_pid.pid_value, user_id)

        # If recid was found, attach it to SIP
        # TODO: This always uses the first recid, as we quit if multiple
        # recids are found in the sips information
        if recid:
            try:
                # PID values are looked up as strings, hence the cast of the
                # integer recid.
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=str(recid))
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'Deposit {depid} does not exists.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                raise DepositRecidDoesNotExist(dep_pid.pid_value, recid)
            record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
            db.session.add(record_sip)

        # Files are attached to the first SIP only.
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    db.session.commit()
    return deposit
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs.

    A new bucket is created and linked to the deposit. When the deposit has
    exactly one draft with pre-reserved DOI information, the corresponding
    'recid' and 'doi' PIDs are reserved if they do not exist yet. For every
    entry of ``deposit['files']`` a ``FileInstance`` and an
    ``ObjectVersion`` are created and described in ``deposit['_files']``.
    For every entry of ``deposit['sips']`` a SIP is created and linked to
    the record PID when that PID exists. The ``SIPFile`` objects are
    attached to the first SIP only.

    :param deposit: Deposit being loaded. It is read as a mapping
        (``files``, ``sips``, ``drafts``, ``_p``), updated in place
        (``_deposit``, ``_buckets``, ``_files``) and committed.
    :param dep_pid: PID of the deposit; used in log messages and raised
        errors.
    :returns: The committed deposit.
    :raises DepositMultipleRecids: If the SIPs refer to more than one
        recid.
    """
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier, PIDStatus
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db

    def reserve_if_missing(pid_type, pid_value):
        """Reserve the given PID unless it already exists."""
        try:
            PersistentIdentifier.get(pid_type=pid_type, pid_value=pid_value)
        except PIDDoesNotExistError:
            PersistentIdentifier.create(
                pid_type=pid_type, pid_value=pid_value,
                object_type='rec', status=PIDStatus.RESERVED)

    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_buckets', dict(deposit=str(buc.id)))
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])

    # Look for prereserved DOI (and recid)
    if 'drafts' in deposit:
        drafts = list(deposit['drafts'].items())
        if len(drafts) != 1:
            # No exception is being handled here, so this is reported with
            # ``error`` rather than ``exception``.
            logger.error('Deposit {dep_pid} has multiple drafts'.format(
                dep_pid=dep_pid))
        else:
            draft_v = drafts[0][1]['values']
            if 'prereserve_doi' in draft_v:
                pre_recid = str(draft_v['prereserve_doi']['recid'])
                pre_doi = str(draft_v['prereserve_doi']['doi'])

                # If pre-reserve info available, try to reserve 'recid'
                # and 'doi'.
                reserve_if_missing('recid', pre_recid)
                reserve_if_missing('doi', pre_doi)

                if RecordIdentifier.query.get(int(pre_recid)) is None:
                    RecordIdentifier.insert(int(pre_recid))

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()
    for file_ in files:
        size = file_['size']
        key = file_['name']
        # Warning: Assumes all checksums are MD5!
        checksum = 'md5:{0}'.format(file_['checksum'])
        fi = FileInstance.create()
        fi.set_uri(file_['path'], size, checksum)
        ov = ObjectVersion.create(buc, key, _file_id=fi.id)
        # File type is the lower-cased extension without the leading dot.
        ext = splitext(ov.key)[1].lower()
        if ext.startswith('.'):
            ext = ext[1:]
        file_meta = dict(
            bucket=str(ov.bucket.id),
            key=ov.key,
            checksum=ov.file.checksum,
            size=ov.file.size,
            version_id=str(ov.version_id),
            type=ext,
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    # Get a recid from SIP information
    recid = None
    if sips:
        recids = [int(sip_data['metadata']['recid']) for sip_data in sips]
        unique_recids = set(recids)
        if len(unique_recids) > 1:
            logger.error('Multiple recids ({recids}) found in deposit '
                         '{depid}.'.format(recids=recids,
                                           depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value,
                                        list(unique_recids))
        elif recids:  # If only one recid
            recid = recids[0]

    for idx, sip_data in enumerate(sips):
        agent = None
        user_id = None
        if sip_data['agents']:
            # Only the first agent is taken into account.
            first_agent = sip_data['agents'][0]
            agent = dict(
                ip_address=empty_str_if_none(
                    first_agent.get('ip_address', "")),
                email=empty_str_if_none(
                    first_agent.get('email_address', "")),
            )
            user_id = first_agent['user_id']
        # A user id of 0 is treated as "no user".
        if user_id == 0:
            user_id = None
        content = sip_data['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format, content, user_id=user_id,
                             agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exists.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            # Fall back to a SIP without user.
            sip = SIP.create(sip_format, content, agent=agent)

        # Attach recid to SIP
        if recid:
            try:
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=str(recid))
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'Deposit {depid} does not exists.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                if deposit['_p']['submitted'] == True:
                    logger.exception('Pair {recid}/{depid} was submitted,'
                                     ' (should it be unpublished?).'.format(
                                         recid=recid,
                                         depid=dep_pid.pid_value))
                else:
                    logger.exception(
                        'Pair {recid}/{depid} was not submitted.'.format(
                            recid=recid, depid=dep_pid.pid_value))
                # Reserve recid
                # NOTE(review): the SIP of this iteration is not linked to
                # the freshly reserved PID - confirm this is intended.
                PersistentIdentifier.create(pid_type='recid',
                                            pid_value=str(recid),
                                            object_type='rec',
                                            status=PIDStatus.RESERVED)
                if RecordIdentifier.query.get(int(recid)) is None:
                    RecordIdentifier.insert(int(recid))
            else:
                record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
                db.session.add(record_sip)

        # Files are attached to the first SIP only.
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    return deposit