def _process_single_fields(self, item: dict):
    """
    Copies the single-valued fields of a Pure record into self.data.

    Each (RDM field name, Pure json path) pair listed below is handed to
    self._add_field, in the listed order. Access right and language are
    read separately, passed through their conversion methods and always
    written to self.data.
    """
    #   RDM field name                              PURE json path
    single_fields = (
        ('title',                                   ['title']),
        ('uuid',                                    ['uuid']),
        ('pureId',                                  ['pureId']),
        ('publicationDate',                         ['publicationStatuses', 0, 'publicationDate', 'year']),
        ('createdDate',                             ['info', 'createdDate']),
        ('pages',                                   ['info', 'pages']),
        ('volume',                                  ['info', 'volume']),
        ('journalTitle',                            ['info', 'journalAssociation', 'title', 'value']),
        ('journalNumber',                           ['info', 'journalNumber']),
        ('metadataModifBy',                         ['info', 'modifiedBy']),
        ('metadataModifDate',                       ['info', 'modifiedDate']),
        ('pure_link',                               ['info', 'portalUrl']),
        ('recordType',                              ['types', 0, 'value']),
        ('category',                                ['categories', 0, 'value']),
        ('peerReview',                              ['peerReview']),
        ('publicationStatus',                       ['publicationStatuses', 0, 'publicationStatuses', 0, 'value']),
        ('numberOfAuthors',                         ['totalNumberOfAuthors']),
        ('workflow',                                ['workflows', 0, 'value']),
        ('confidential',                            ['confidential']),
        ('publisherName',                           ['publisher', 'names', 0, 'value']),
        ('abstract',                                ['abstracts', 0, 'value']),
        ('managingOrganisationalUnit_name',         ['managingOrganisationalUnit', 'names', 0, 'value']),
        ('managingOrganisationalUnit_uuid',         ['managingOrganisationalUnit', 'uuid']),
        ('managingOrganisationalUnit_externalId',   ['managingOrganisationalUnit', 'externalId']),
    )
    for rdm_field, pure_path in single_fields:
        self._add_field(item, rdm_field, pure_path)

    # Access right
    access_right = get_value(item, ['openAccessPermissions', 0, 'value'])
    self.data['access_right'] = self._accessright_conversion(access_right)

    # Language
    language = get_value(item, ['languages', 0, 'value'])
    self.data['language'] = self._language_conversion(language)
def _get_contributor_name(self, item: object):
    """
    Stores the contributor name in self.sub_data['name'].

    The name is formatted as '<last name>, <first name>'. Whichever part
    is missing (falsy) in the item is replaced by a placeholder text.
    """
    first_name = get_value(item, ['name', 'firstName'])
    last_name = get_value(item, ['name', 'lastName'])

    if not first_name:
        first_name = '(first name not specified)'
    if not last_name:
        # The placeholder must go to last_name: writing it to first_name
        # would discard the first name and leave the falsy last name in
        # the formatted result.
        last_name = '(last name not specified)'

    self.sub_data['name'] = f'{last_name}, {first_name}'
def _populate_xml(self, item, name_space):
    """
    Builds the 'dataset' xml element of one record under self.root.

    item is the record the values are read from and name_space maps the
    'dataset' and 'commons' prefixes to their xml namespaces.
    Returns False when the record has no title; otherwise the method
    runs to the end without an explicit return value.
    """
    dataset_ns = name_space['dataset']

    # Dataset element
    body = ET.SubElement(self.root, "{%s}dataset" % dataset_ns)
    body.set('type', 'dataset')

    # Title (mandatory field)
    title = get_value(item, ['title'])
    if not title:
        # NOTE(review): the dataset element above has already been created
        # under self.root at this point - confirm callers cope with that.
        return False
    self._sub_element(body, dataset_ns, 'title').text = title

    # Managing organisation (mandatory field)
    managing_organisation = self._sub_element(body, dataset_ns, 'managingOrganisation')
    self._add_attribute(item, managing_organisation, 'lookupId', ['managingOrganisationalUnit_externalId'])

    # Persons (mandatory field)
    self._add_persons(body, name_space, item)

    # Available date (mandatory field)
    available_date = self._sub_element(body, dataset_ns, 'availableDate')
    year = self._sub_element(available_date, name_space['commons'], 'year')
    year.text = get_value(item, ['publication_date'])

    # Publisher (mandatory field)
    publisher = self._sub_element(body, dataset_ns, 'publisher')                 # REVIEW!!!!
    publisher.set('lookupId', '45d22915-6545-4428-896a-8b8046191d5d')            # Data not in rdm
    self._sub_element(publisher, dataset_ns, 'name').text = 'Test publisher'     # Data not in rdm
    self._sub_element(publisher, dataset_ns, 'type').text = 'publisher'          # Data not in rdm

    # Description
    description_text = get_value(item, ['abstract'])
    # NOTE(review): the abstract read above is immediately replaced by a
    # fixed text, so every record gets the same description - looks like
    # a testing leftover, confirm before removing.
    description_text = 'test description'
    if description_text:
        descriptions = self._sub_element(body, dataset_ns, 'descriptions')
        description = self._sub_element(descriptions, dataset_ns, 'description')
        description.set('type', 'datasetdescription')
        description.text = description_text

    # Links
    self._add_links(body, name_space)

    # Organisations
    self._add_organisations(body, name_space, item)
def get_files_data(self, item: dict):
    """
    Collects the metadata of one Pure file entry and downloads the file.

    Entries without a 'file' key, or whose 'file' lacks 'fileURL' or
    'fileName', are ignored and False is returned. Otherwise the file
    metadata is appended to self.data['versionFiles'], the file is
    requested from Pure and the response is handed to
    self._process_file_download_response.
    """
    if 'file' not in item:
        return False
    pure_file = item['file']
    if not ('fileURL' in pure_file and 'fileName' in pure_file):
        return False

    internal_review = False     # Default value

    pure_file_size = get_value(item, ['file', 'size'])
    file_name = get_value(item, ['file', 'fileName'])
    file_url = get_value(item, ['file', 'fileURL'])

    self.pure_rdm_file_match = []

    # Looks for a file in RDM (same uuid) having the same size and name as the Pure file
    for rdm_file in self.rdm_file_review:
        rdm_file_size = str(rdm_file['size'])
        rdm_review = rdm_file['review']

        if pure_file_size != rdm_file_size or file_name != rdm_file['name']:
            continue

        # First entry: do the old and new file match?
        # Second entry: was the old file reviewed?
        self.pure_rdm_file_match.extend([True, rdm_review])
        # The new uploaded file will have the same review value as in RDM
        internal_review = rdm_review
        break

    self.sub_data = {'internalReview': internal_review}

    #   RDM field name          PURE json path
    file_fields = (
        ('name',                ['file', 'fileName']),
        ('size',                ['file', 'size']),
        ('mimeType',            ['file', 'mimeType']),
        ('digest',              ['file', 'digest']),
        ('digestAlgorithm',     ['file', 'digestAlgorithm']),
        ('createdBy',           ['creator']),
        ('createdDate',         ['created']),
        ('versionType',         ['versionTypes', 0, 'value']),
        ('licenseType',         ['licenseTypes', 0, 'value']),
    )
    for rdm_field, pure_path in file_fields:
        self._add_subdata(item, rdm_field, pure_path)

    # Access type
    access_type = get_value(item, ['accessTypes', 0, 'value'])
    self.sub_data['accessType'] = self._accessright_conversion(access_type)

    # Adds the file metadata to the record data
    self.data['versionFiles'].append(self.sub_data)

    # Download file from Pure
    response = get_pure_file(self, file_url, file_name)

    # Original note: checks if the file is already in RDM, and if it has already been reviewed
    self._process_file_download_response(response, file_name)
def _add_field(self, item: list, rdm_field: str, path: list):
    """
    Stores in self.data[rdm_field] the value found in item at the given path.

    Falsy values are not stored, the field is simply left out.
    NOTE(review): item is annotated as list, but _process_single_fields
    passes the dict it receives - confirm the intended type.
    """
    found = get_value(item, path)
    if not found:
        return
    self.data[rdm_field] = found
def _process_person_associations(self):
    """
    Builds self.data['contributors'] from the record 'personAssociations'.

    Nothing happens when self.item has no 'personAssociations' key.
    For each association a fresh self.sub_data is filled and appended to
    the contributors; when a user id is found for the person it is added
    (as int) to self.data['owners'], unless already listed.
    """
    if 'personAssociations' not in self.item:
        return

    self.data['contributors'] = []
    file_data = file_read_lines('user_ids_match')

    #   RDM field name              PURE json path
    contributor_fields = (
        ('uuid',                    ['person', 'uuid']),
        ('externalId',              ['person', 'externalId']),
        ('authorCollaboratorName',  ['authorCollaboration', 'names', 0, 'value']),
        ('personRole',              ['personRoles', 0, 'value']),
        ('organisationalUnit',      ['organisationalUnits', 0, 'names', 0, 'value']),
        ('type_p',                  ['externalPerson', 'types', 0, 'value']),
        # Same RDM field as the first entry: an external person uuid,
        # when stored, replaces the person uuid
        ('uuid',                    ['externalPerson', 'uuid']),
    )

    for association in self.item['personAssociations']:
        self.sub_data = {}
        self._get_contributor_name(association)

        for rdm_field, pure_path in contributor_fields:
            self._add_subdata(association, rdm_field, pure_path)

        # Checks if the record owner is available in user_ids_match.txt
        external_id = get_value(association, ['person', 'externalId'])
        owner = self.general_functions.get_userid_from_list_by_externalid(external_id, file_data)
        if owner:
            owner_id = int(owner)
            if owner_id not in self.data['owners']:
                self.data['owners'].append(owner_id)

        # ORCID
        self._process_contributor_orcid()

        self.data['contributors'].append(self.sub_data)
def _add_persons(self, body, name_space, item):
    """
    Adds to body a 'persons' element with one 'person' per contributor.

    Contributors are read from item['contributors']. Each person entry
    gets the 'contactPerson' attribute, its uuid as 'id', a nested
    'person' element carrying the external id as 'lookupId', and the
    'role' and 'name' elements.
    """
    dataset_ns = name_space['dataset']
    persons = self._sub_element(body, dataset_ns, 'persons')

    for contributor in item['contributors']:
        person = self._sub_element(persons, dataset_ns, 'person')
        person.set('contactPerson', 'true')
        self._add_attribute(contributor, person, 'id', ['uuid'])

        # External id
        lookup_person = self._sub_element(person, dataset_ns, 'person')
        self._add_attribute(contributor, lookup_person, 'lookupId', ['externalId'])

        # Role
        role_element = self._sub_element(person, dataset_ns, 'role')
        role_element.text = get_value(contributor, ['personRole'])

        # Name
        name_element = self._sub_element(person, dataset_ns, 'name')
        name_element.text = get_value(contributor, ['name'])
def _add_links(self, body, name_space):
    """
    Adds relative links for RDM files and api.

    The links are read from self.full_item ('links' -> 'files' and
    'links' -> 'self'). A 'links' element is added to body only when at
    least one of the two is available; each available link gets its own
    'link' entry holding a 'url' and a 'description'.
    """
    link_files = get_value(self.full_item, ['links', 'files'])
    link_self = get_value(self.full_item, ['links', 'self'])
    recid = get_value(self.full_item, ['id'])

    if not (link_files or link_self):
        return

    dataset_ns = name_space['dataset']
    links = self._sub_element(body, dataset_ns, 'links')

    # Files link first, then the record API link
    available_links = (
        (link_files, 'Link to record files'),
        (link_self, 'Link to record API'),
    )
    for url, description in available_links:
        if not url:
            continue

        link = self._sub_element(links, dataset_ns, 'link')
        # Like _add_attribute, the attribute is skipped when there is no
        # value; xml attribute values have to be strings, hence str().
        if recid:
            link.set('id', str(recid))      # REVIEW - which id?
        self._sub_element(link, dataset_ns, 'url').text = url
        self._sub_element(link, dataset_ns, 'description').text = description
def _process_organisational_units(self):
    """
    Stores the record organisational units in self.data.

    Nothing happens when self.item has no 'organisationalUnits' key.
    Otherwise every unit is added to self.data['organisationalUnits']
    (name, uuid, externalId), its external id is added to
    self.data['groupRestrictions'] and self.groups.rdm_create_group is
    called for it.
    """
    if 'organisationalUnits' not in self.item:
        return

    self.data['organisationalUnits'] = []
    self.data['groupRestrictions'] = []

    for unit in self.item['organisationalUnits']:
        unit_name = get_value(unit, ['names', 0, 'value'])
        unit_uuid = get_value(unit, ['uuid'])
        unit_external_id = get_value(unit, ['externalId'])

        self.data['organisationalUnits'].append({
            'name': unit_name,
            'uuid': unit_uuid,
            'externalId': unit_external_id,
        })

        # Adding organisational unit as group owner
        self.data['groupRestrictions'].append(unit_external_id)

        # Create group
        self.groups.rdm_create_group(unit_external_id, unit_name)
def _add_organisations(self, body, name_space, item):
    """
    Adds to body an 'organisations' element with one 'organisation'
    entry for each of item['organisationalUnits'].

    Each entry gets the unit external id as 'lookupId' attribute and a
    'name' element with the unit name.
    """
    dataset_ns = name_space['dataset']
    organisations = self._sub_element(body, dataset_ns, 'organisations')

    # Pure dataset documentation:
    # Can be both an internal and external organisation, use origin to enforce either internal or external.
    # If the organisation is an internal organisation in Pure, then the lookupId attribute must be used.
    # If the organisation is an external organisation and id is given matching will be done on the id,
    # if not found matching will be done on name, if still not found then an external
    # organisation with the specified id and organisation will be created.
    for unit in item['organisationalUnits']:
        organisation = self._sub_element(organisations, dataset_ns, 'organisation')
        self._add_attribute(unit, organisation, 'lookupId', ['externalId'])

        name_element = self._sub_element(organisation, dataset_ns, 'name')
        name_element.text = get_value(unit, ['name'])
def _add_text(self, item: object, sub_element: object, path):
    """
    Sets the text of the given xml element to the value found in item
    (rdm response) at the given path.
    """
    text_value = get_value(item, path)
    sub_element.text = text_value
def _add_attribute(self, item: object, sub_element, attribute: str, value_path: list):
    """
    Sets an attribute of the given xml element to the value found in
    item (rdm response) at value_path.

    The attribute is not set when the value is falsy.
    """
    attribute_value = get_value(item, value_path)
    if not attribute_value:
        return
    sub_element.set(attribute, attribute_value)
def _add_subdata(self, item: list, rdm_field: str, path: list):
    """
    Stores in self.sub_data[rdm_field] the value found in item at the given path.

    Falsy values are not stored, the field is simply left out.
    """
    found = get_value(item, path)
    if not found:
        return
    self.sub_data[rdm_field] = found