def parse_temporal_coverage(self):
    """
    Find time coverage of the metadata.

    The text corpus info is queried first; the audio corpus info is only
    consulted when the text lookup produced nothing truthy.

    :return: result of ``first`` for the matching timeCoverage text nodes
    """
    text_xpath = ("//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd"
                  ":timeCoverage/text()")
    audio_xpath = ("//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusAudioInfo/cmd:timeCoverageInfo/cmd"
                   ":timeCoverage/text()")

    coverage = first(self._text_xpath(self.resource_info, text_xpath))
    if coverage:
        return coverage
    return first(self._text_xpath(self.resource_info, audio_xpath))
def _strip_first(elements):
    """
    Strip and return the first element.

    :param elements: list of xml elements
    :return: stripped first element, or an empty string when the first
             element is missing or falsy
    """
    head = first(elements)
    if not head:
        return ""
    return head.strip()
def _get_persons(cls, root, xpath):
    """
    Extract person dictionaries from XML using the given XPath.

    :param root: parent element (lxml) where selection is done
    :param xpath: xpath selector used to get data
    :return: list of person dictionaries with the keys 'role', 'surname',
             'given_name', 'email' and 'organization'
    """
    def stripped_text(node, selector):
        # Stripped first text match of `selector` below `node`.
        return cls._strip_first(node.xpath(selector, namespaces=cls.namespaces))

    persons = []
    for person in root.xpath(xpath, namespaces=cls.namespaces):
        persons.append({
            'role': stripped_text(person, "cmd:role/text()"),
            'surname': stripped_text(person, "cmd:personInfo/cmd:surname/text()"),
            'given_name': stripped_text(person, "cmd:personInfo/cmd:givenName/text()"),
            'email': stripped_text(person, "cmd:personInfo/cmd:communicationInfo/cmd:email/text()"),
            'organization': first(cls._get_organizations(person, "cmd:personInfo/cmd:affiliation")),
        })
    return persons
def is_allowed_org_member_edit(group_dict, user_id, target_id, target_role):
    '''
    Check if the user is allowed to edit an organization member.

    :param group_dict: organization dict; its 'users' entry is searched for user_id
    :param user_id: user id
    :param target_id: target user id
    :param target_role: target's current role (its 'original' attribute is used when present)
    :return: True when the edit is allowed, False otherwise
    '''
    target_role = getattr(target_role, 'original', target_role)

    matching_users = [member for member in group_dict['users'] if member.get('id') == user_id]
    acting_user = fn.first(matching_users)
    if not acting_user:
        return False

    user_role = acting_user.get('capacity')
    target_role = target_role.lower()

    if acting_user.get('sysadmin'):
        return True

    editing_self = user_id == target_id
    # Allowed as soon as any candidate role has a truthy permission entry.
    return any(
        settings.ORGANIZATION_MEMBER_PERMISSIONS.get((user_role, target_role, possible_role, editing_self))
        for possible_role in ['admin', 'editor', 'member'])
def _get_org_auth(tag_tree):
    '''
    Returns an iterator over organization-author dicts from metadata.

    IDA style contributors are preferred; OAI-DC creators are used only when
    the IDA lookup yields nothing.
    '''
    def name_of(tag):
        return tag.find('name').string

    def oai_dc():
        ''' Get 'author' and 'organization' information from OAI-DC '''
        creator_filter = _filter_tag_name_namespace(name='creator', namespace=NS['dc'])
        for creator in tag_tree(creator_filter, recursive=False):
            yield {'org': '', 'value': creator.string}

    def ida():
        ''' Get 'author' and 'organization' information from IDA '''
        contributor_filter = _filter_tag_name_namespace(name='contributor', namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            person = contributor.Person
            organization = contributor.Organization
            if not (person or organization):
                # Neither a person nor an organization: nothing to report.
                continue
            yield {'org': name_of(organization) if organization else '',
                   'value': name_of(person) if person else ''}

    if first(ida()):
        return ida()
    return oai_dc()
def _get_availability(self):
    """
    Get availability from the cscida 'availability' tag, falling back to
    the description values when no such tag is found.

    :return: single-item list with the stripped tag string, or the
             result of ``_get_description_values('availability')``
    """
    tag_filter = _filter_tag_name_namespace(name='availability', namespace=NS['cscida'])
    availability_tag = first(self.dc(tag_filter, recursive=False))
    if not availability_tag:
        return self._get_description_values('availability')
    return [availability_tag.string.strip()]
def _strip_first(elements):
    """
    Strip and return the first element.

    :param elements: list of xml elements
    :return: stripped first element, or an empty string when the first
             element is missing or falsy
    """
    head = first(elements)
    if not head:
        return ""
    return head.strip()
def is_allowed_org_member_edit(group_dict, user_id, target_id, target_role):
    '''
    Check if the user is allowed to edit an organization member.

    :param group_dict: organization dict; its 'users' entry is searched for user_id
    :param user_id: user id
    :param target_id: target user id
    :param target_role: target's current role (its 'original' attribute is used when present)
    :return: True when the edit is allowed, False otherwise
    '''
    target_role = getattr(target_role, 'original', target_role)

    matching_users = [member for member in group_dict['users'] if member.get('id') == user_id]
    acting_user = fn.first(matching_users)
    if not acting_user:
        return False

    user_role = acting_user.get('capacity')
    target_role = target_role.lower()

    if acting_user.get('sysadmin'):
        return True

    editing_self = user_id == target_id
    # Allowed as soon as any candidate role has a truthy permission entry.
    return any(
        settings.ORGANIZATION_MEMBER_PERMISSIONS.get((user_role, target_role, possible_role, editing_self))
        for possible_role in ['admin', 'editor', 'member'])
def _get_org_auth(tag_tree):
    '''
    Returns an iterator over organization-author dicts from metadata.

    IDA style contributors are preferred; OAI-DC creators are used only when
    the IDA lookup yields nothing.
    '''
    def name_of(tag):
        return tag.find('name').string

    def oai_dc():
        ''' Get 'author' and 'organization' information from OAI-DC '''
        creator_filter = _filter_tag_name_namespace(name='creator', namespace=NS['dc'])
        for creator in tag_tree(creator_filter, recursive=False):
            yield {'org': '', 'value': creator.string}

    def ida():
        ''' Get 'author' and 'organization' information from IDA '''
        contributor_filter = _filter_tag_name_namespace(name='contributor', namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            person = contributor.Person
            organization = contributor.Organization
            if not (person or organization):
                # Neither a person nor an organization: nothing to report.
                continue
            yield {'org': name_of(organization) if organization else '',
                   'value': name_of(person) if person else ''}

    if first(ida()):
        return ida()
    return oai_dc()
def _get_uploader(self):
    '''
    Get uploader from cscida tags.

    :return: stripped string of the first cscida 'uploader' tag, or an
             empty string when no such tag is found
    '''
    tag_filter = _filter_tag_name_namespace(name='uploader', namespace=NS['cscida'])
    uploader_tag = first(self.dc(tag_filter, recursive=False))
    if not uploader_tag:
        return ''
    return uploader_tag.string.strip()
def _get_mime_type(self):
    '''
    Get general.mime_type from data.

    :return: stripped string of the first cscida 'general.mime_type' tag,
             or the description value for the same key when no tag is found
    '''
    tag_filter = _filter_tag_name_namespace(name='general.mime_type', namespace=NS['cscida'])
    mime_tag = first(self.dc(tag_filter, recursive=False))
    if not mime_tag:
        return self._get_description_value('general.mime_type')
    return mime_tag.string.strip()
def _get_availability(self):
    """
    Get availability from the cscida 'availability' tag, falling back to
    the description values when no such tag is found.

    :return: single-item list with the stripped tag string, or the
             result of ``_get_description_values('availability')``
    """
    tag_filter = _filter_tag_name_namespace(name='availability', namespace=NS['cscida'])
    availability_tag = first(self.dc(tag_filter, recursive=False))
    if not availability_tag:
        return self._get_description_values('availability')
    return [availability_tag.string.strip()]
def _get_contact_email(self, pkg_id, contact_id): recipient = None if contact_id: contacts = utils.get_package_contacts(pkg_id) contact = fn.first(filter(lambda c: c.get('id') == contact_id, contacts)) if contact and 'email' in contact.keys(): email = contact.get('email') name = contact.get('name') recipient = {'name': name, 'email': email} return recipient
def _get_contact_email(self, pkg_id, contact_id): recipient = None if contact_id: contacts = utils.get_package_contacts(pkg_id) contact = fn.first(filter(lambda c: c.get('id') == contact_id, contacts)) if contact and 'email' in contact.keys(): email = contact.get('email') name = contact.get('name') recipient = {'name': name, 'email': email} return recipient
def _get_persons(cls, root, xpath):
    """
    Extract person dictionaries from XML using the given XPath.

    :param root: parent element (lxml) where selection is done
    :param xpath: xpath selector used to get data
    :return: list of person dictionaries with the keys 'role', 'surname',
             'given_name', 'email' and 'organization'
    """
    def stripped_text(node, selector):
        # Stripped first text match of `selector` below `node`.
        return cls._strip_first(node.xpath(selector, namespaces=cls.namespaces))

    persons = []
    for person in root.xpath(xpath, namespaces=cls.namespaces):
        persons.append({
            'role': stripped_text(person, "cmd:role/text()"),
            'surname': stripped_text(person, "cmd:personInfo/cmd:surname/text()"),
            'given_name': stripped_text(person, "cmd:personInfo/cmd:givenName/text()"),
            'email': stripped_text(person, "cmd:personInfo/cmd:communicationInfo/cmd:email/text()"),
            'organization': first(cls._get_organizations(person, "cmd:personInfo/cmd:affiliation")),
        })
    return persons
def _get_mime_type(self):
    '''
    Get general.mime_type from data.

    :return: stripped string of the first cscida 'general.mime_type' tag,
             or the description value for the same key when no tag is found
    '''
    tag_filter = _filter_tag_name_namespace(name='general.mime_type', namespace=NS['cscida'])
    mime_tag = first(self.dc(tag_filter, recursive=False))
    if not mime_tag:
        return self._get_description_value('general.mime_type')
    return mime_tag.string.strip()
def _get_uploader(self):
    '''
    Get uploader from cscida tags.

    :return: stripped string of the first cscida 'uploader' tag, or an
             empty string when no such tag is found
    '''
    tag_filter = _filter_tag_name_namespace(name='uploader', namespace=NS['cscida'])
    uploader_tag = first(self.dc(tag_filter, recursive=False))
    if not uploader_tag:
        return ''
    return uploader_tag.string.strip()
def get_member_role(group_id, user_id):
    """
    Get the user's role for this group.

    Only active memberships of table 'user' are considered.

    :param group_id: Group ID
    :param user_id: User ID
    :return: capacity of the first matching membership, as picked by
             ``fn.first`` from all query results
    """
    member = model.Member
    criteria = (
        member.group_id == group_id,
        member.table_name == 'user',
        member.state == 'active',
        member.table_id == user_id,
    )

    query = model.Session.query(member)
    for criterion in criteria:
        query = query.filter(criterion)

    capacities = [membership.capacity for membership in query.all()]
    return fn.first(capacities)
def _get_project_stuff(tag_tree):
    '''
    Get project_funder, project_funding, project_name, project_homepage

    :param tag_tree: metadata (dc) element in BeautifulSoup tree
    :return: the per-project (funder, funding, name, homepage) tuples
             transposed with ``zip`` into four parallel sequences, or None
             when no contributor carries a Project
    '''
    def ida():
        contributor_filter = _filter_tag_name_namespace(name='contributor', namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            project = contributor.Project
            if not project:
                continue

            comment = project.comment
            comment_text = (comment.string or u'') if comment else u''
            # partition() always gives exactly one funder and one funding
            # part, so every yielded tuple is four items wide even when the
            # separator is missing or repeated. split() could yield one or
            # three-plus parts and break the four-way unpacking of the
            # transposed result.
            funder, _, funding = comment_text.partition(u' rahoituspäätös ')

            name_tag = project.find('name')
            name = name_tag.string if name_tag else ''
            about = project.get('about', '')
            yield (funder, funding, name, about)

    return zip(*ida()) if first(ida()) else None
def get_member_role(group_id, user_id):
    """
    Get the user's role for this group.

    Only active memberships of table 'user' are considered.

    :param group_id: Group ID
    :param user_id: User ID
    :return: capacity of the first matching membership, as picked by
             ``fn.first`` from all query results
    """
    member = model.Member
    criteria = (
        member.group_id == group_id,
        member.table_name == 'user',
        member.state == 'active',
        member.table_id == user_id,
    )

    query = model.Session.query(member)
    for criterion in criteria:
        query = query.filter(criterion)

    capacities = [membership.capacity for membership in query.all()]
    return fn.first(capacities)
def _get_project_stuff(tag_tree):
    '''
    Get project_funder, project_funding, project_name, project_homepage

    :param tag_tree: metadata (dc) element in BeautifulSoup tree
    :return: the per-project (funder, funding, name, homepage) tuples
             transposed with ``zip`` into four parallel sequences, or None
             when no contributor carries a Project
    '''
    def ida():
        contributor_filter = _filter_tag_name_namespace(name='contributor', namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            project = contributor.Project
            if not project:
                continue

            comment = project.comment
            comment_text = (comment.string or u'') if comment else u''
            # partition() always gives exactly one funder and one funding
            # part, so every yielded tuple is four items wide even when the
            # separator is missing or repeated. split() could yield one or
            # three-plus parts and break the four-way unpacking of the
            # transposed result.
            funder, _, funding = comment_text.partition(u' rahoituspäätös ')

            name_tag = project.find('name')
            name = name_tag.string if name_tag else ''
            about = project.get('about', '')
            yield (funder, funding, name, about)

    return zip(*ida()) if first(ida()) else None
def __init__(self, xml, provider=None):
    """
    Initialize the helper for parsing the given xml.

    :param xml: an lxml object, representing a CMDI record
    :param provider: provider identifier; defaults to the configured
                     'ckan.site_url' when falsy
    :raises CmdiParseException: when the CMD or the resourceInfo element
                                is missing from the record
    """
    namespaces = CmdiParseHelper.namespaces

    cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD', namespaces=namespaces))
    if cmd is None:
        raise CmdiParseException(
            "Unexpected XML format: No CMD -element found")

    # Use first() instead of indexing with [0]: indexing an empty result
    # raises IndexError, which made the None check below unreachable.
    resource_info = first(cmd.xpath("//cmd:Components/cmd:resourceInfo", namespaces=namespaces))
    if resource_info is None:
        raise CmdiParseException(
            "Unexpected XML format: No resourceInfo -element found")

    self.xml = xml
    self.cmd = cmd
    self.resource_info = resource_info
    self.provider = provider or config.get('ckan.site_url')
def parse_license(self):
    """
    Find the license for the metadata.

    :return: result of ``first`` over the licence text nodes found below
             the resource info
    """
    licence_xpath = "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()"
    licences = self._text_xpath(self.resource_info, licence_xpath)
    return first(licences)
def parse_modified(self):
    """
    Find date when metadata was last modified.

    :return: result of ``first`` over the metadataLastDateUpdated text
             nodes found below the resource info
    """
    modified_xpath = "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()"
    timestamps = self._text_xpath(self.resource_info, modified_xpath)
    return first(timestamps)
def ddi25_mapper(xml):
    """
    Convert given DDI 2.5 XML into MetaX format dict.

    :param xml: xml element (lxml)
    :return: dictionary with preferred_identifier, title, creator,
             description, keyword, field_of_science, publisher, provenance
             and spatial; 'modified' and 'temporal' are added when available
    """
    namespaces = {'oai': "http://www.openarchives.org/OAI/2.0/",
                  'ddi': "ddi:codebook:2_5"}
    cb = first(xml.xpath('//oai:record/oai:metadata/ddi:codeBook', namespaces=namespaces))
    stdy = cb.find('ddi:stdyDscr', namespaces)

    # Preferred identifier
    pref_id = None
    id_nos = stdy.findall('ddi:citation/ddi:titlStmt/ddi:IDNo', namespaces)
    id_no = first(filter(lambda x: x.get('agency') == 'Kansalliskirjasto', id_nos))
    if id_no is not None:
        pref_id = id_no.text

    # Title
    title = {}
    titl = stdy.findall('ddi:citation/ddi:titlStmt/ddi:titl', namespaces)
    if len(titl):
        for t in titl:
            title[get_tag_lang(t)] = t.text

    # Creator
    # Assume that 'AuthEnty' tags for different language 'citations' are in same order
    creators = []
    try:
        for i, citation in enumerate(stdy.findall('ddi:citation', namespaces)):
            for j, author in enumerate(citation.xpath(
                    'ddi:rspStmt/ddi:AuthEnty|ddi:rspStmt/ddi:othId', namespaces=namespaces)):
                agent_obj = {'name': None}
                if 'affiliation' in author.keys():
                    org = author.get('affiliation')
                    if i == 0:
                        # First citation creates the agent ...
                        agent_obj['@type'] = 'Person'
                        if org is not None:
                            agent_obj['member_of'] = {
                                'name': {
                                    get_tag_lang(author): org},
                                '@type': 'Organization'}
                        # TODO: Check here that othIds are handled correctly
                        agent_obj['name'] = author.text.strip()
                        creators.append(agent_obj)
                    elif org is not None:
                        # ... later citations only add translated organization names.
                        creators[j]['member_of']['name'][get_tag_lang(author)] = org
                else:
                    if i == 0:
                        agent_obj['@type'] = 'Organization'
                        agent_obj['name'] = {get_tag_lang(author): author.text.strip()}
                        creators.append(agent_obj)
                    else:
                        creators[j]['name'][get_tag_lang(author)] = author.text.strip()
                if author.tag.split('}')[1] == 'othId':
                    log.info('Tag "othId" found, check it is correctly parsed(TODO)!')
    except Exception as e:
        log.error('Error parsing "creators": {0}: {1}. Check that different '
                  'language elements match at the source.'.format(e.__class__.__name__, e))
        raise

    # Modified
    modified = None
    ver_stmt = stdy.find('ddi:citation/ddi:verStmt/ddi:version', namespaces)
    if ver_stmt is not None and ver_stmt.get('date'):
        modified = get_string_as_valid_datetime_string(ver_stmt.get('date'), '01-01')

    # Description
    description = {}
    try:
        for abstract in stdy.findall('ddi:stdyInfo/ddi:abstract', namespaces):
            description[get_tag_lang(abstract)] = unicode(abstract.text).strip()
    except Exception as e:
        log.error('Error parsing "description": {0}: {1}'.format(e.__class__.__name__, e))
        raise

    # Keywords
    keywords = []
    for kw in stdy.findall('ddi:stdyInfo/ddi:subject/ddi:keyword', namespaces):
        keywords.append(kw.text.strip())
    vocab = 'CESSDA Topic Classification'
    for cterm in stdy.findall("ddi:stdyInfo/ddi:subject/ddi:topcClas[@vocab='{0}']".format(vocab), namespaces):
        keywords.append(cterm.text.strip())

    # Field of science
    codes = set()
    for fos in stdy.findall("ddi:stdyInfo/ddi:subject/ddi:topcClas[@vocab='OKM']", namespaces):
        field = 'label.' + get_tag_lang(fos)
        codes.add(get_ref_data('field_of_science', field, fos.text.strip(), 'code'))
    field_of_science = [{'identifier': c} for c in codes]
    if not len(field_of_science):
        log.debug("No 'field of science' found.")
        field_of_science.append({'identifier': 'ta5'})

    # Publisher
    publisher = {
        'name': {},
        '@type': 'Organization',
        "homepage": {
            "title": {
                "en": "Publisher website",
                "fi": "Julkaisijan kotisivu"},
            "identifier": ""}
    }
    for dist in stdy.findall('ddi:citation/ddi:distStmt', namespaces):
        distr = dist.find('ddi:distrbtr', namespaces)
        publisher['name'][get_tag_lang(distr)] = distr.text.strip()
        publisher['homepage']['identifier'] = distr.get('URI')

    # Temporal coverage
    tpath = "ddi:stdyInfo/ddi:sumDscr/ddi:{tag}[@event='{ev}']"

    def _find_event(event):
        # Prefer 'timePrd', fall back to 'collDate'. The comparison must be
        # an explicit "is not None": an lxml element without child elements
        # is falsy, so "find(a) or find(b)" would drop a found 'timePrd'.
        for tag in ('timePrd', 'collDate'):
            found = stdy.find(tpath.format(tag=tag, ev=event), namespaces)
            if found is not None:
                return found
        return None

    tstart = _find_event('start')
    tend = _find_event('end')
    if tstart is None and tend is None:
        tstart = _find_event('single')
        tend = tstart
    elif tstart is None or tend is None:
        log.error('No temporal coverage or only start or end date in dataset!')
    temporal_coverage_obj_1 = {}
    if tstart is not None and tstart.get('date'):
        start_dt = get_string_as_valid_datetime_string(tstart.get('date'), '01-01', '00:00:00')
        if start_dt is None:
            # Not parseable as a date: keep the raw text instead.
            temporal_coverage_obj_1['temporal_coverage'] = tstart.get('date')
            if tend is not None and tend.get('date'):
                temporal_coverage_obj_1['temporal_coverage'] += ' - ' + tend.get('date')
        else:
            temporal_coverage_obj_1['start_date'] = start_dt
            if tend is not None and tend.get('date'):
                end_dt = get_string_as_valid_datetime_string(tend.get('date'), '12-31', '23:59:59')
                if end_dt is not None:
                    temporal_coverage_obj_1['end_date'] = end_dt

    # Provenance
    universe = {}
    univ = stdy.findall("ddi:stdyInfo/ddi:sumDscr/ddi:universe", namespaces)
    for u in univ:
        universe[get_tag_lang(u)] = u.text.strip()
    provenance = [{'title': {'en': 'Collection'},
                   'description': {
                       'en': 'Contains the date(s) when the data were collected.'},
                   'variable': [{'pref_label': universe}]
                   }]
    if temporal_coverage_obj_1:
        provenance[0]['temporal'] = temporal_coverage_obj_1

    # Production
    prod = stdy.find('ddi:citation/ddi:prodStmt/ddi:prodDate', namespaces)
    if prod is not None:
        temporal_coverage_obj_2 = {}
        if prod.text:
            start_dt = get_string_as_valid_datetime_string(prod.text.strip(), '01-01', '00:00:00')
            if start_dt is None:
                temporal_coverage_obj_2['temporal_coverage'] = prod.text.strip()
            else:
                temporal_coverage_obj_2['start_date'] = start_dt
                temporal_coverage_obj_2['end_date'] = get_string_as_valid_datetime_string(
                    prod.text.strip(), '12-31', '23:59:59')
        provenance.append(
            {'title': {'en': 'Production'},
             'description': {'en': 'Date when the data collection were'
                                   ' produced (not distributed or archived)'}})
        if temporal_coverage_obj_2:
            provenance[1]['temporal'] = temporal_coverage_obj_2

    # Geographical coverage
    spatial = [{}]
    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    lang_path = "ddi:stdyInfo/ddi:sumDscr/ddi:nation[@{la}='{lt}']"
    nat_fi = stdy.find(lang_path.format(la=lang_attr, lt='fi'), namespaces)
    nat_en = stdy.find(lang_path.format(la=lang_attr, lt='en'), namespaces)
    if nat_en is not None:
        spatial = [{'geographic_name': nat_en.text.strip()}]
    if nat_fi is not None:
        # Assume Finland so search ES for Finnish place names: 'nat_fi'
        spat_id = get_ref_data('location', 'label.fi', nat_fi.text.strip(), 'code')
        if spat_id is not None:
            spatial[0]['place_uri'] = {'identifier': spat_id}
        if spatial[0].get('geographic_name') is None:
            spatial[0]['geographic_name'] = nat_fi.text.strip()

    package_dict = {
        "preferred_identifier": pref_id,
        "title": title,
        "creator": creators,
        "description": description,
        "keyword": keywords,
        "field_of_science": field_of_science,
        "publisher": publisher,
        "provenance": provenance,
        "spatial": spatial
    }
    if modified is not None:
        package_dict['modified'] = modified
    if temporal_coverage_obj_1:
        package_dict['temporal'] = [temporal_coverage_obj_1]

    return package_dict
def _get_description_value(self, key):
    """
    Get a single description value for the given key.

    :param key: key passed on to ``_get_description_values``
    :return: result of ``first`` over the values found for the key
    """
    values = self._get_description_values(key)
    return first(values)
def get_funder(data_dict):
    '''
    Get a single funder from agent field in data_dict.

    :param data_dict: dataset dict handed on to ``get_funders``
    :return: result of ``fn.first`` over the funders found
    '''
    funders = get_funders(data_dict)
    return fn.first(funders)
def fsd_refiner(context, data_dict):
    """
    Refines the given MetaX data dict in a FSD-specific way.

    Sets 'language' and fills 'access_rights' (license, access_type,
    restriction_grounds, description) from the DDI codebook, then hands the
    dict to ``set_existing_kata_identifier_to_other_identifier``.

    :param context: Dictionary with an lxml-field (read from key 'source_data')
    :param data_dict: Dataset dictionary in MetaX format; modified in place
    :return: the same dictionary object that was passed as data_dict
    """
    namespaces = {'oai': "http://www.openarchives.org/OAI/2.0/",
                  'ddi': "ddi:codebook:2_5"}

    # Patterns matched against the start of the English restriction text,
    # with the license / access type each one maps to.
    ACCESS_RIGHTS = [{
        'match': r"The dataset is \(A\)",
        'license': 'other-open',
        'access_type': 'open'}, {
        'match': r"The dataset is \(B\)",
        'license': 'other-closed',
        'access_type': 'restricted',
        'restriction_grounds': ['education', 'research']}, {
        'match': r"The dataset is \(C\)",
        'license': 'other-closed',
        'access_type': 'restricted',
        'restriction_grounds': ['research']}, {
        'match': r"The dataset is \(D\)",
        'license': 'other-closed',
        'access_type': 'restricted'}]

    # package_dict is an alias, not a copy: the caller's dict is mutated.
    package_dict = data_dict
    xml = context.get('source_data')
    cb = first(xml.xpath('//oai:record/oai:metadata/ddi:codeBook', namespaces=namespaces))

    # Language
    # One language per fileName element, taken from the element's language tag.
    # NOTE(review): the loop variable `fn` may shadow a module-level name
    # of the same name within this function — confirm it is not needed here.
    languages = [get_tag_lang(fn) for fn in cb.findall(
        'ddi:fileDscr/ddi:fileTxt/ddi:fileName', namespaces)]

    language_list = [{'identifier': get_language_identifier(
        convert_language(lang))} for lang in languages]

    package_dict['language'] = language_list

    # Licence and access type
    if 'access_rights' not in package_dict:
        package_dict['access_rights'] = {}
    restriction = {}
    for res in cb.findall('ddi:stdyDscr/ddi:dataAccs/ddi:useStmt/ddi:restrctn', namespaces):
        restriction[get_tag_lang(res)] = res.text.strip()
    if len(restriction.get('en', '')):
        # Only the first matching access-rights pattern is applied.
        for ar in ACCESS_RIGHTS:
            if re.match(ar['match'], restriction.get('en', '')):
                package_dict['access_rights']['license'] = [{
                    'identifier': ar['license'],
                    'description': restriction}]
                package_dict['access_rights']['access_type'] = {
                    'identifier': ar['access_type']}
                restriction_grounds = []
                for rg in ar.get('restriction_grounds', []):
                    restriction_grounds.append({'identifier': rg})
                if restriction_grounds:
                    package_dict['access_rights']['restriction_grounds'] = restriction_grounds
                break
    if package_dict['access_rights'].get('license') is None:
        log.error('Unknown licence in dataset')
    conditions = {}
    for cond in cb.findall('ddi:stdyDscr/ddi:dataAccs/ddi:useStmt/ddi:conditions', namespaces):
        conditions[get_tag_lang(cond)] = cond.text.strip()
    if len(conditions):
        package_dict['access_rights']['description'] = conditions
    # Fall back to the 'restricted' access type URI when none was set above.
    if 'access_type' not in package_dict['access_rights']:
        package_dict['access_rights']['access_type'] = {
            'identifier': 'http://uri.suomi.fi/codelist/fairdata/access_type/code/restricted'
        }

    # Add old pid
    # The CSV path is resolved relative to the directory of this module.
    old_pids_path = os.path.dirname(__file__) + '/resources/fsd_pid_to_kata_urn.csv'
    set_existing_kata_identifier_to_other_identifier(
        old_pids_path, package_dict['preferred_identifier'], package_dict)

    return package_dict
def get_distributor(data_dict):
    '''
    Get a single distributor from agent field in data_dict.

    :param data_dict: dataset dict handed on to ``get_distributors``
    :return: result of ``fn.first`` over the distributors found
    '''
    distributors = get_distributors(data_dict)
    return fn.first(distributors)
def get_funder(data_dict):
    '''
    Get a single funder from agent field in data_dict.

    :param data_dict: dataset dict handed on to ``get_funders``
    :return: result of ``fn.first`` over the funders found
    '''
    funders = get_funders(data_dict)
    return fn.first(funders)
def read_data(self, xml):
    """
    Extract package data from given XML.

    :param xml: xml element (lxml)
    :return: dictionary
    :raises CmdiReaderException: when the CMD or the resourceInfo element
                                 is missing from the record
    """
    cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD', namespaces=self.namespaces))
    if cmd is None:
        raise CmdiReaderException("Unexpected XML format: No CMD -element found")

    # Use first() instead of indexing with [0]: indexing an empty result
    # raises IndexError, which made the None check below unreachable.
    resource_info = first(cmd.xpath("//cmd:Components/cmd:resourceInfo", namespaces=self.namespaces))
    if resource_info is None:
        raise CmdiReaderException("Unexpected XML format: No resourceInfo -element found")

    metadata_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:identifier/text()")
    data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")
    languages = self._text_xpath(cmd, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()")

    # convert the descriptions to a JSON string of type {"fin":"kuvaus", "eng","desc"}
    desc_json = {}
    for desc in xml.xpath("//cmd:identificationInfo/cmd:description", namespaces=self.namespaces):
        lang = utils.convert_language(desc.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
        desc_json[lang] = unicode(desc.text).strip()

    description = json.dumps(desc_json)

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
    transl_json = {}
    for title in xml.xpath('//cmd:identificationInfo/cmd:resourceName', namespaces=self.namespaces):
        lang = utils.convert_language(title.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
        transl_json[lang] = title.text.strip()

    title = json.dumps(transl_json)

    version = first(self._text_xpath(resource_info, "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
    coverage = first(self._text_xpath(resource_info, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()")) or ""
    license_identifier = first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()")) or 'notspecified'

    # Metadata identifiers containing 'urn' become the primary pid (the last
    # such identifier wins); all others are stored as 'metadata' pids.
    primary_pid = None
    provider = self.provider
    pids = []
    for pid in [dict(id=pid, provider=provider, type='metadata') for pid in metadata_identifiers]:
        if 'urn' in pid.get('id', ""):
            primary_pid = pid['id']
        else:
            pids.append(pid)

    # A data pid is marked primary when it equals the first data identifier.
    pids += [dict(id=pid, provider=provider, type='data', primary=data_identifiers.index(pid) == 0) for pid in data_identifiers]

    # Coverage is only split when it has exactly one '-' separator.
    temporal_coverage_begin = ""
    temporal_coverage_end = ""
    if coverage:
        split = [item.strip() for item in coverage.split("-")]
        if len(split) == 2:
            temporal_coverage_begin = split[0]
            temporal_coverage_end = split[1]

    # TODO: Check agent mapping. Sources that are not mapped yet:
    #   persons: //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson,
    #            //cmd:metadataInfo/cmd:metadataCreator
    #   organizations: //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization
    contacts = self._persons_as_contact(self._get_persons(resource_info, "//cmd:contactPerson"))

    agents = []
    agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
    agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"), 'owner'))
    agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization"), 'author'))
    agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"), 'owner'))

    result = {'name': self._to_name(primary_pid or first(metadata_identifiers)),
              'language': ",".join(languages),
              'pids': pids,
              'version': version,
              'notes': description,
              #'langtitle': titles,
              'title': title,
              'type': 'dataset',
              'contact': contacts,
              'agent': agents,
              'availability': 'contact_owner',
              'temporal_coverage_begin': temporal_coverage_begin,
              'temporal_coverage_end': temporal_coverage_end,
              'license_id': license_identifier}

    if not languages:
        result['langdis'] = u'True'

    if primary_pid:
        result['id'] = primary_pid

    # TODO: Ask about distributionAccessMedium
    # _strip_first(_text_xpath(resource_info, "//cmd:distributionInfo/availability/text()"))
    # url = _strip_first(_text_xpath(resource_info, "//cmd:identificationInfo/cmd:url/text()"))
    download_location = first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:downloadLocation/text()"))

    # A download location overrides the default 'contact_owner' availability.
    if download_location:
        result['through_provider_URL'] = download_location
        result['availability'] = 'through_provider'

    return result
def _read(self):
    """
    Build the unified internal harvester format dict from ``self.dc``.

    :return: dict with the unified dataset fields; 'id' and 'name' are
             generated here, and a primary pid entry is appended when the
             metadata did not provide one
    """
    # Fall back to four empty strings when no project information is found.
    project_funder, project_funding, project_name, project_homepage = _get_project_stuff(self.dc) or ('', '', '', '')

    # Todo! This needs to be improved to use also simple-dc
    # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
    availability, license_id, license_url, access_application_url = _get_rights(self.dc) or ('', '', '', '')
    if not availability:
        availability = first(self._get_availability())

    uploader = self._get_uploader()

    data_pids = list(_get_data_pids(self.dc))

    tags = []
    #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
    #    tags.extend(self._resolve_tags(tag))
    tags = [a.string for a in self.dc('subject', recursive=False)]

    # Titles are stored as a JSON string mapping language code to title.
    transl_json = {}
    for title in self.dc('title', recursive=False):
        lang = utils.convert_language(title.get('xml:lang', '').strip())
        transl_json[lang] = title.string.strip()

    title = json.dumps(transl_json)

    def _get_primary_pid(data_pids):
        # Returns the first 'urn:nbn:fi:csc-ida' pid as a one-item list and
        # REMOVES it from data_pids, so the pid is not repeated as a relation
        # when data_pids is iterated afterwards. Returning right after the
        # removal keeps mutating the list during iteration safe.
        for dpid in data_pids:
            if dpid.startswith('urn:nbn:fi:csc-ida'):
                data_pids.remove(dpid)
                return [dpid]
        return []

    # Create a unified internal harvester format dict
    unified = dict(
        # ?=dc('source', recursive=False),
        # ?=dc('relation', recursive=False),
        # ?=dc('type', recursive=False),

        access_application_URL=access_application_url or '',

        # Todo! Implement
        access_request_URL='',

        algorithm=first(_get_algorithm(self.dc)) or '',

        # TODO: Handle availabilities better
        availability=availability,

        checksum=_get_checksum(self.dc) or '',

        direct_download_URL=first(_get_download(self.dc)) or '',

        # Todo! Implement
        discipline='',

        # Todo! Should be possible to implement with QDC, but not with OAI_DC
        # evdescr=[],
        # evtype=[],
        # evwhen=[],
        # evwho=[],

        # Todo! Implement
        geographic_coverage='',

        #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],

        title=title,

        language=','.join(sorted([a.string for a in self.dc('language', recursive=False)])),

        license_URL=license_url or '',
        license_id=license_id or 'notspecified',

        # Todo! Using only the first entry, for now
        contact=[dict(name=name or "", email=email or "", URL=url or "", phone=phone or "")
                 for name, email, phone, url in self._get_maintainer_stuff()],

        # Todo! IDA currently doesn't produce this, maybe in future
        # dc('hasFormat', recursive=False)
        mimetype=self._get_mime_type(),

        notes=self._read_notes(),

        # Todo! Using only the first entry, for now
        # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

        # Order matters: the primary pid list is evaluated first and removes
        # the primary pid from data_pids before data_pids is listed below.
        pids=[dict(id=pid, provider=_get_provider(self.bs), type=u'primary') for pid in _get_primary_pid(data_pids)] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in data_pids] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in self._get_version_pids()] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in _get_metadata_pid(self.dc)],

        # Agents: authors, contributors, exactly one funder and one owner entry.
        agent=[dict(role='author',
                    name=orgauth.get('value', ''),
                    id='',
                    organisation=orgauth.get('org', ''),
                    URL='',
                    fundingid='') for orgauth in _get_org_auth(self.dc)] +
              [dict(role='contributor',
                    name=contributor.get('value', ''),
                    id='',
                    organisation=contributor.get('org', ''),
                    URL='',
                    fundingid='') for contributor in _get_contributor(self.dc)] +
              [dict(role='funder',
                    name=first(project_name) or '',
                    id=first(project_name) or '',
                    organisation=first(project_funder) or "",
                    URL=first(project_homepage) or '',
                    fundingid=first(project_funding) or '',)] +
              [dict(role='owner',
                    name=first([a.get('resource') for a in self.dc('rightsHolder', recursive=False)]) or
                         first(_get_rightsholder(self.dc)) or '',
                    id='',
                    organisation='',
                    URL='',
                    fundingid='')],

        tag_string=','.join(tags) or '',

        # Todo! Implement if possible
        temporal_coverage_begin='',
        temporal_coverage_end='',

        type='dataset',
        uploader=uploader,

        # Used in smear harvest code to extract variable, station and year values, but is not used when
        # creating the dataset via API.
        smear_url=first(_get_download(self.dc, False)) or '',

        # Todo! This should be more exactly picked
        version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
        # version=dc(
        #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
        #     partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
    )
    if not unified['language']:
        unified['langdis'] = 'True'

    # Create id and name
    unified['id'] = generate_pid()
    unified['name'] = pid_to_name(unified['id'])

    # If primary pid is missing, set package id as primary pid
    if not any(pid.get('type', None) == u'primary' for pid in unified['pids']):
        unified['pids'].append(dict(id=unified['id'], type=u'primary', provider=None))

    # if not unified['project_name']:
    #    unified['projdis'] = 'True'
    return unified
def _get_mime_type(self):
    """
    Get the mime type from the 'format' tags whose text contains a slash.

    :return: string of the first matching 'format' tag, or an empty string
             when ``first`` yields nothing truthy
    """
    slash_pattern = re.compile('/')
    format_strings = [tag.string for tag in self.dc('format', text=slash_pattern, recursive=False)]
    return first(format_strings) or ''
def _get_description_value(self, key):
    """
    Get a single description value for the given key.

    :param key: key passed on to ``_get_description_values``
    :return: result of ``first`` over the values found for the key
    """
    values = self._get_description_values(key)
    return first(values)
def read_data(self, xml):
    """ Extract package data from given XML.

    :param xml: xml element (lxml)
    :return: dictionary
    :raises CmdiReaderException: if the CMD or the resourceInfo element is not found
    """
    cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD', namespaces=self.namespaces))
    if cmd is None:
        raise CmdiReaderException("Unexpected XML format: No CMD -element found")

    # Use first() rather than [0]: indexing an empty xpath result raises IndexError,
    # which made the check below unreachable and hid the intended exception.
    resource_info = first(cmd.xpath("//cmd:Components/cmd:resourceInfo", namespaces=self.namespaces))
    if resource_info is None:
        raise CmdiReaderException("Unexpected XML format: No resourceInfo -element found")

    metadata_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:identifier/text()")

    languages = self._text_xpath(cmd, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()")

    # convert the descriptions to a JSON string of type {"fin":"kuvaus", "eng":"desc"}
    desc_json = {}
    for desc in xml.xpath("//cmd:identificationInfo/cmd:description", namespaces=self.namespaces):
        lang = convert_language(desc.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
        # An empty element has text None; unicode(None) would store the literal "None".
        desc_json[lang] = unicode(desc.text or '').strip()

    description = json.dumps(desc_json)

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng":"title"}
    transl_json = {}
    for title_element in xml.xpath('//cmd:identificationInfo/cmd:resourceName', namespaces=self.namespaces):
        lang = convert_language(title_element.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
        # An empty element has text None; guard so one empty title does not abort the record.
        transl_json[lang] = (title_element.text or '').strip()

    title = json.dumps(transl_json)
    provider = self.provider
    version = first(self._text_xpath(resource_info, "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
    coverage = first(self._text_xpath(resource_info, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()")) or ""

    pids = []
    primary_pid = ''
    direct_download_URL = ''
    access_request_URL = ''
    access_application_URL = ''

    # data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")

    # Only the first identifier containing 'urn' is recorded, as the primary pid;
    # the remaining identifiers are currently ignored.
    for metadata_pid in metadata_identifiers:
        pid = CmdiReader._language_bank_urn_pid_enhancement(metadata_pid)
        if 'urn' in pid and not primary_pid:
            pids.append(dict(id=pid, provider=provider, type='primary'))
            primary_pid = pid
    #     else:
    #         pids.append(dict(id=pid, provider=provider, type='relation', relation='generalRelation'))
    #
    # pids += [dict(id=CmdiReader._language_bank_urn_pid_enhancement(pid), provider=provider, type='relation',
    #               relation='generalRelation') for pid in data_identifiers]

    license_identifier = CmdiReader._language_bank_license_enhancement(
        first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()")) or 'notspecified')
    availability = CmdiReader._language_bank_availability_from_license(license_identifier)

    # No access URL is filled in while the license is still under negotiation.
    if license_identifier.lower().strip() != 'undernegotiation':
        if availability == 'direct_download':
            direct_download_URL = primary_pid
        if availability == 'access_request':
            access_request_URL = primary_pid
        if availability == 'access_application_other':
            # The part of the primary pid after the last '/' is used as the resource parameter.
            sliced_pid = primary_pid.rsplit('/', 1)
            if len(sliced_pid) >= 2:
                access_application_URL = 'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource=' + sliced_pid[1]

    temporal_coverage_begin = ""
    temporal_coverage_end = ""

    if coverage:
        # Only a value of the form "<begin>-<end>" (exactly one '-') is accepted.
        split = [item.strip() for item in coverage.split("-")]
        if len(split) == 2:
            temporal_coverage_begin = split[0]
            temporal_coverage_end = split[1]

    # TODO: Check agent mapping.
    # Sources that were inspected while debugging but are not mapped below:
    #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson
    #   //cmd:metadataInfo/cmd:metadataCreator
    #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization

    contacts = self._persons_as_contact(self._get_persons(resource_info, "//cmd:contactPerson"))

    agents = []
    agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
    agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"), 'owner'))
    agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization"), 'author'))
    agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"), 'owner'))

    # Reuse the id of a package already registered under this primary pid, otherwise get a new one.
    existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
    package_id = existing_package_id if existing_package_id else get_unique_package_id()

    result = {'name': pid_to_name(package_id),
              'language': ",".join(languages),
              'pids': pids,
              'version': version,
              'notes': description,
              'title': title,
              'type': 'dataset',
              'contact': contacts,
              'agent': agents,
              'availability': availability,
              'direct_download_URL': direct_download_URL,
              'access_request_URL': access_request_URL,
              'access_application_URL': access_application_URL,
              'temporal_coverage_begin': temporal_coverage_begin,
              'temporal_coverage_end': temporal_coverage_end,
              'license_id': license_identifier,
              'license_URL': ''}

    if not languages:
        result['langdis'] = u'True'

    if package_id:
        result['id'] = package_id

    # TODO: Ask about distributionAccessMedium
    # _strip_first(_text_xpath(resource_info, "//cmd:distributionInfo/availability/text()"))
    # url = _strip_first(_text_xpath(resource_info, "//cmd:identificationInfo/cmd:url/text()"))

    return result
def get_distributor(data_dict):
    '''
    Get a single distributor for the given data_dict

    :param data_dict: passed through unchanged to get_distributors
    :return: the first item produced by get_distributors(data_dict), as picked by fn.first
    '''
    distributors = get_distributors(data_dict)
    return fn.first(distributors)
def read_data(self, xml):
    """ Extract package data from given XML.

    :param xml: xml element (lxml)
    :return: dictionary
    :raises CmdiReaderException: if the CMD or the resourceInfo element is not found
    """
    cmd = first(
        xml.xpath('//oai:record/oai:metadata/cmd:CMD',
                  namespaces=self.namespaces))
    if cmd is None:
        raise CmdiReaderException(
            "Unexpected XML format: No CMD -element found")

    # Use first() rather than [0]: indexing an empty xpath result raises IndexError,
    # which made the check below unreachable and hid the intended exception.
    resource_info = first(
        cmd.xpath("//cmd:Components/cmd:resourceInfo",
                  namespaces=self.namespaces))
    if resource_info is None:
        raise CmdiReaderException(
            "Unexpected XML format: No resourceInfo -element found")

    metadata_identifiers = self._text_xpath(
        cmd, "//cmd:identificationInfo/cmd:identifier/text()")

    languages = self._text_xpath(
        cmd,
        "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()"
    )

    # convert the descriptions to a JSON string of type {"fin":"kuvaus", "eng":"desc"}
    desc_json = {}
    for desc in xml.xpath("//cmd:identificationInfo/cmd:description",
                          namespaces=self.namespaces):
        lang = convert_language(
            desc.get('{http://www.w3.org/XML/1998/namespace}lang',
                     'undefined').strip())
        # An empty element has text None; unicode(None) would store the literal "None".
        desc_json[lang] = unicode(desc.text or '').strip()

    description = json.dumps(desc_json)

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng":"title"}
    transl_json = {}
    for title_element in xml.xpath('//cmd:identificationInfo/cmd:resourceName',
                                   namespaces=self.namespaces):
        lang = convert_language(
            title_element.get('{http://www.w3.org/XML/1998/namespace}lang',
                              'undefined').strip())
        # An empty element has text None; guard so one empty title does not abort the record.
        transl_json[lang] = (title_element.text or '').strip()

    title = json.dumps(transl_json)
    provider = self.provider
    version = first(
        self._text_xpath(
            resource_info,
            "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
    coverage = first(
        self._text_xpath(
            resource_info,
            "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()"
        )) or ""

    pids = []
    primary_pid = ''
    direct_download_URL = ''
    access_request_URL = ''
    access_application_URL = ''

    # data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")

    # Only the first identifier containing 'urn' is recorded, as the primary pid;
    # the remaining identifiers are currently ignored.
    for metadata_pid in metadata_identifiers:
        pid = CmdiReader._language_bank_urn_pid_enhancement(metadata_pid)
        if 'urn' in pid and not primary_pid:
            pids.append(dict(id=pid, provider=provider, type='primary'))
            primary_pid = pid
    #     else:
    #         pids.append(dict(id=pid, provider=provider, type='relation', relation='generalRelation'))
    #
    # pids += [dict(id=CmdiReader._language_bank_urn_pid_enhancement(pid), provider=provider, type='relation',
    #               relation='generalRelation') for pid in data_identifiers]

    license_identifier = CmdiReader._language_bank_license_enhancement(
        first(
            self._text_xpath(
                resource_info,
                "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()"
            )) or 'notspecified')
    availability = CmdiReader._language_bank_availability_from_license(
        license_identifier)

    # No access URL is filled in while the license is still under negotiation.
    if license_identifier.lower().strip() != 'undernegotiation':
        if availability == 'direct_download':
            direct_download_URL = primary_pid
        if availability == 'access_request':
            access_request_URL = primary_pid
        if availability == 'access_application_other':
            # The part of the primary pid after the last '/' is used as the resource parameter.
            sliced_pid = primary_pid.rsplit('/', 1)
            if len(sliced_pid) >= 2:
                access_application_URL = 'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource=' + sliced_pid[1]

    temporal_coverage_begin = ""
    temporal_coverage_end = ""

    if coverage:
        # Only a value of the form "<begin>-<end>" (exactly one '-') is accepted.
        split = [item.strip() for item in coverage.split("-")]
        if len(split) == 2:
            temporal_coverage_begin = split[0]
            temporal_coverage_end = split[1]

    # TODO: Check agent mapping.
    # Sources that were inspected while debugging but are not mapped below:
    #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson
    #   //cmd:metadataInfo/cmd:metadataCreator
    #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization

    contacts = self._persons_as_contact(
        self._get_persons(resource_info, "//cmd:contactPerson"))

    agents = []
    agents.extend(
        self._persons_as_agent(
            self._get_persons(
                resource_info,
                "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
    agents.extend(
        self._persons_as_agent(
            self._get_persons(
                resource_info,
                "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"
            ), 'owner'))
    agents.extend(
        self._organization_as_agent(
            self._get_organizations(
                resource_info,
                "//cmd:distributionInfo/cmd:iprHolderOrganization"),
            'author'))
    agents.extend(
        self._organization_as_agent(
            self._get_organizations(
                resource_info,
                "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"
            ), 'owner'))

    # Reuse the id of a package already registered under this primary pid, otherwise get a new one.
    existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
    package_id = existing_package_id if existing_package_id else get_unique_package_id()

    result = {
        'name': pid_to_name(package_id),
        'language': ",".join(languages),
        'pids': pids,
        'version': version,
        'notes': description,
        'title': title,
        'type': 'dataset',
        'contact': contacts,
        'agent': agents,
        'availability': availability,
        'direct_download_URL': direct_download_URL,
        'access_request_URL': access_request_URL,
        'access_application_URL': access_application_URL,
        'temporal_coverage_begin': temporal_coverage_begin,
        'temporal_coverage_end': temporal_coverage_end,
        'license_id': license_identifier,
        'license_URL': ''
    }

    if not languages:
        result['langdis'] = u'True'

    if package_id:
        result['id'] = package_id

    # TODO: Ask about distributionAccessMedium
    # _strip_first(_text_xpath(resource_info, "//cmd:distributionInfo/availability/text()"))
    # url = _strip_first(_text_xpath(resource_info, "//cmd:identificationInfo/cmd:url/text()"))

    return result
def _get_mime_type(self):
    """ Pick the mime type from the record's ``format`` elements.

    Only direct children of ``self.dc`` are considered (``recursive=False``), and
    only those whose text matches the regular expression ``/``.

    :return: string of the first such element, or '' when nothing usable is found
    """
    # NOTE(review): self.dc is presumably a BeautifulSoup tag (calling it acts as find_all) - confirm
    contains_slash = re.compile('/')
    format_strings = []
    for format_tag in self.dc('format', text=contains_slash, recursive=False):
        format_strings.append(format_tag.string)
    return first(format_strings) or ''
def _read(self):
    """ Build the unified internal harvester format dict from ``self.dc``.

    Values are collected with the module level ``_get_*`` helpers and a few
    methods of this reader. A new package id is always generated, and it is
    also used as the primary pid when the data pids contain no
    'urn:nbn:fi:csc-ida' identifier.

    :return: dictionary in the unified harvester format
    """
    # NOTE(review): self.dc is presumably a BeautifulSoup tag (callable like find_all, `.string`) - confirm
    project_funder, project_funding, project_name, project_homepage = _get_project_stuff(
        self.dc) or ('', '', '', '')

    # Todo! This needs to be improved to use also simple-dc
    # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
    availability, license_id, license_url, access_application_url = _get_rights(
        self.dc) or ('', '', '', '')
    if not availability:
        availability = first(self._get_availability())

    uploader = self._get_uploader()

    data_pids = list(_get_data_pids(self.dc))

    #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
    #    tags.extend(self._resolve_tags(tag))
    # `.string` is None for an empty element or one with nested markup; such
    # entries are skipped because joining None raises TypeError.
    tags = [a.string for a in self.dc('subject', recursive=False)
            if a.string is not None]
    languages = sorted(a.string for a in self.dc('language', recursive=False)
                       if a.string is not None)

    # Titles as a JSON string keyed by language code
    transl_json = {}
    for title_tag in self.dc('title', recursive=False):
        lang = utils.convert_language(title_tag.get('xml:lang', '').strip())
        # Guard against `.string` being None so one odd title does not abort the record
        transl_json[lang] = (title_tag.string or '').strip()

    title = json.dumps(transl_json)

    # The first IDA urn becomes the primary pid and is removed from data_pids so
    # that it is not listed a second time among the relation pids below.
    primary_pids = []
    for dpid in data_pids:
        if dpid.startswith('urn:nbn:fi:csc-ida'):
            data_pids.remove(dpid)
            primary_pids = [dpid]
            break

    # Create a unified internal harvester format dict
    unified = dict(
        # ?=dc('source', recursive=False),
        # ?=dc('relation', recursive=False),
        # ?=dc('type', recursive=False),

        access_application_URL=access_application_url or '',

        # Todo! Implement
        access_request_URL='',

        algorithm=first(_get_algorithm(self.dc)) or '',

        # TODO: Handle availabilities better
        availability=availability,

        checksum=_get_checksum(self.dc) or '',

        direct_download_URL=first(_get_download(self.dc)) or '',

        # Todo! Implement
        discipline='',

        # Todo! Should be possible to implement with QDC, but not with OAI_DC
        # evdescr=[],
        # evtype=[],
        # evwhen=[],
        # evwho=[],

        # Todo! Implement
        geographic_coverage='',

        #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
        title=title,

        language=','.join(languages),

        license_URL=license_url or '',
        license_id=license_id or 'notspecified',

        # Todo! Using only the first entry, for now
        contact=[
            dict(name=name or "",
                 email=email or "",
                 URL=url or "",
                 phone=phone or "")
            for name, email, phone, url in self._get_maintainer_stuff()
        ],

        # Todo! IDA currently doesn't produce this, maybe in future
        # dc('hasFormat', recursive=False)
        mimetype=self._get_mime_type(),

        notes=self._read_notes(),

        # Todo! Using only the first entry, for now
        # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

        pids=[
            dict(id=pid, provider=_get_provider(self.bs), type=u'primary')
            for pid in primary_pids
        ] + [
            dict(id=pid,
                 provider=_get_provider(self.bs),
                 type=u'relation',
                 relation=u'generalRelation') for pid in data_pids
        ] + [
            dict(id=pid,
                 provider=_get_provider(self.bs),
                 type=u'relation',
                 relation=u'generalRelation')
            for pid in self._get_version_pids()
        ] + [
            dict(id=pid,
                 provider=_get_provider(self.bs),
                 type=u'relation',
                 relation=u'generalRelation')
            for pid in _get_metadata_pid(self.dc)
        ],

        agent=[
            dict(role='author',
                 name=orgauth.get('value', ''),
                 id='',
                 organisation=orgauth.get('org', ''),
                 URL='',
                 fundingid='') for orgauth in _get_org_auth(self.dc)
        ] + [
            dict(role='contributor',
                 name=contributor.get('value', ''),
                 id='',
                 organisation=contributor.get('org', ''),
                 URL='',
                 fundingid='') for contributor in _get_contributor(self.dc)
        ] + [
            dict(
                role='funder',
                name=first(project_name) or '',
                id=first(project_name) or '',
                organisation=first(project_funder) or "",
                URL=first(project_homepage) or '',
                fundingid=first(project_funding) or '',
            )
        ] + [
            dict(role='owner',
                 name=first([
                     a.get('resource')
                     for a in self.dc('rightsHolder', recursive=False)
                 ]) or first(_get_rightsholder(self.dc)) or '',
                 id='',
                 organisation='',
                 URL='',
                 fundingid='')
        ],

        tag_string=','.join(tags) or '',

        # Todo! Implement if possible
        temporal_coverage_begin='',
        temporal_coverage_end='',

        type='dataset',
        uploader=uploader,

        # Used in smear harvest code to extract variable, station and year values, but is not used when
        # creating the dataset via API.
        smear_url=first(_get_download(self.dc, False)) or '',

        # Todo! This should be more exactly picked
        version=(self.dc.modified or self.dc.date).string
        if (self.dc.modified or self.dc.date) else '',
        # version=dc(
        #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
        #         partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
    )
    if not unified['language']:
        unified['langdis'] = 'True'

    # Create id and name
    unified['id'] = generate_pid()
    unified['name'] = pid_to_name(unified['id'])

    # If primary pid is missing, set package id as primary pid
    if not any(
            pid.get('type', None) == u'primary' for pid in unified['pids']):
        unified['pids'].append(
            dict(id=unified['id'], type=u'primary', provider=None))

    # if not unified['project_name']:
    #     unified['projdis'] = 'True'
    return unified