コード例 #1
0
 def parse_temporal_coverage(self):
     """ Find time coverage of the metadata.

     The text corpus info is queried first; the audio corpus info is only
     consulted when the text lookup gives nothing truthy.

     :return: first matching time coverage text, or whatever ``first``
              yields for the audio lookup when neither path matches
     """
     text_path = ("//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd"
                  ":timeCoverage/text()")
     audio_path = ("//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusAudioInfo/cmd:timeCoverageInfo/cmd"
                   ":timeCoverage/text()")

     coverage = first(self._text_xpath(self.resource_info, text_path))
     if coverage:
         return coverage
     return first(self._text_xpath(self.resource_info, audio_path))
コード例 #2
0
    def _strip_first(elements):
        """ Return the first element with surrounding whitespace stripped.

        :param elements: sequence of values (e.g. the result of an xpath query)
        :return: stripped first element, or an empty string when the first
                 element is missing or falsy
        """
        head = first(elements)
        if not head:
            return ""
        return head.strip()
コード例 #3
0
    def _get_persons(cls, root, xpath):
        """ Build person dictionaries from the elements selected by an Xpath.

        :param root: parent element (lxml) where selection is done
        :param xpath: xpath selector used to get data
        :return: list of dicts with the keys 'role', 'surname', 'given_name',
                 'email' and 'organization'
        """
        def stripped_text(node, selector):
            # First text match of `selector` below `node`, stripped.
            return cls._strip_first(
                node.xpath(selector, namespaces=cls.namespaces))

        persons = []
        for node in root.xpath(xpath, namespaces=cls.namespaces):
            role = stripped_text(node, "cmd:role/text()")
            surname = stripped_text(node, "cmd:personInfo/cmd:surname/text()")
            given_name = stripped_text(
                node, "cmd:personInfo/cmd:givenName/text()")
            email = stripped_text(
                node, "cmd:personInfo/cmd:communicationInfo/cmd:email/text()")
            # Only the first affiliation is kept for each person.
            organization = first(
                cls._get_organizations(node, "cmd:personInfo/cmd:affiliation"))
            persons.append({
                'role': role,
                'surname': surname,
                'given_name': given_name,
                'email': email,
                'organization': organization,
            })
        return persons
コード例 #4
0
def is_allowed_org_member_edit(group_dict, user_id, target_id, target_role):
    '''
    Decide whether a user may edit a member of an organization.

    :param group_dict: organization dict; its 'users' list is searched for the acting user
    :param user_id: id of the acting user
    :param target_id: id of the member being edited
    :param target_role: target's current role (its 'original' attribute is used when present)
    :return: True when the edit is allowed, otherwise False
    '''
    def _is_acting_user(member):
        return member.get('id') == user_id

    target_role = getattr(target_role, 'original', target_role)

    acting_user = fn.first(filter(_is_acting_user, group_dict['users']))
    if not acting_user:
        # The acting user does not belong to this organization.
        return False

    acting_role = acting_user.get('capacity')
    target_role = target_role.lower()

    if acting_user.get('sysadmin'):
        return True

    # Allowed as soon as the permission table has a truthy entry for any
    # role the target could be given.
    return any(
        settings.ORGANIZATION_MEMBER_PERMISSIONS.get(
            (acting_role, target_role, candidate_role, user_id == target_id))
        for candidate_role in ['admin', 'editor', 'member'])
コード例 #5
0
def _get_org_auth(tag_tree):
    '''
    Returns an iterator over organization-author dicts from metadata

    Every dict has the keys 'org' and 'value'. IDA contributors are used when
    they produce at least one entry; otherwise OAI-DC creators are used.
    '''
    def from_oai_dc():
        '''
        Yield author entries (with empty organization) from OAI-DC creators
        '''
        creator_filter = _filter_tag_name_namespace(name='creator', namespace=NS['dc'])
        for creator in tag_tree(creator_filter, recursive=False):
            yield {'org': '', 'value': creator.string}

    def from_ida():
        '''
        Yield author/organization entries from IDA contributors
        '''
        contributor_filter = _filter_tag_name_namespace(name='contributor', namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            person = contributor.Person
            organization = contributor.Organization
            if not (person or organization):
                # Contributor carries neither a person nor an organization.
                continue
            yield {
                'org': organization.find('name').string if organization else '',
                'value': person.find('name').string if person else '',
            }

    # Probe with a throwaway generator, then hand back a fresh one.
    if first(from_ida()):
        return from_ida()
    return from_oai_dc()
コード例 #6
0
    def _get_availability(self):
        """ Get availability, preferring the cscida tag over description values.

        :return: one-item list with the stripped tag string when a cscida
                 'availability' tag exists, otherwise the result of
                 ``_get_description_values('availability')``
        """
        tag_filter = _filter_tag_name_namespace(name='availability', namespace=NS['cscida'])
        availability_tag = first(self.dc(tag_filter, recursive=False))
        if not availability_tag:
            return self._get_description_values('availability')

        return [availability_tag.string.strip()]
コード例 #7
0
    def _strip_first(elements):
        """ Return the first element with surrounding whitespace stripped.

        :param elements: sequence of values (e.g. the result of an xpath query)
        :return: stripped first element, or an empty string when the first
                 element is missing or falsy
        """
        candidate = first(elements)
        if candidate:
            return candidate.strip()
        return ""
コード例 #8
0
def is_allowed_org_member_edit(group_dict, user_id, target_id, target_role):
    '''
    Decide whether a user may edit a member of an organization.

    :param group_dict: organization dict; its 'users' list is searched for the acting user
    :param user_id: id of the acting user
    :param target_id: id of the member being edited
    :param target_role: target's current role (its 'original' attribute is used when present)
    :return: True when the edit is allowed, otherwise False
    '''
    def _is_acting_user(member):
        return member.get('id') == user_id

    target_role = getattr(target_role, 'original', target_role)

    acting_user = fn.first(filter(_is_acting_user, group_dict['users']))
    if not acting_user:
        # The acting user does not belong to this organization.
        return False

    acting_role = acting_user.get('capacity')
    target_role = target_role.lower()

    if acting_user.get('sysadmin'):
        return True

    # Allowed as soon as the permission table has a truthy entry for any
    # role the target could be given.
    return any(
        settings.ORGANIZATION_MEMBER_PERMISSIONS.get(
            (acting_role, target_role, candidate_role, user_id == target_id))
        for candidate_role in ['admin', 'editor', 'member'])
コード例 #9
0
def _get_org_auth(tag_tree):
    '''
    Returns an iterator over organization-author dicts from metadata

    Every dict has the keys 'org' and 'value'. IDA contributors are used when
    they produce at least one entry; otherwise OAI-DC creators are used.
    '''
    def from_oai_dc():
        '''
        Yield author entries (with empty organization) from OAI-DC creators
        '''
        creator_filter = _filter_tag_name_namespace(name='creator',
                                                    namespace=NS['dc'])
        for creator in tag_tree(creator_filter, recursive=False):
            yield {'org': '', 'value': creator.string}

    def from_ida():
        '''
        Yield author/organization entries from IDA contributors
        '''
        contributor_filter = _filter_tag_name_namespace(name='contributor',
                                                        namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            person = contributor.Person
            organization = contributor.Organization
            if not (person or organization):
                # Contributor carries neither a person nor an organization.
                continue
            yield {
                'org': organization.find('name').string if organization else '',
                'value': person.find('name').string if person else '',
            }

    # Probe with a throwaway generator, then hand back a fresh one.
    if first(from_ida()):
        return from_ida()
    return from_oai_dc()
コード例 #10
0
    def _get_uploader(self):
        '''
        Get uploader from cscida tags

        :return: stripped string of the first cscida 'uploader' tag, or an
                 empty string when no such tag is found
        '''
        tag_filter = _filter_tag_name_namespace(name='uploader', namespace=NS['cscida'])
        uploader_tag = first(self.dc(tag_filter, recursive=False))
        if not uploader_tag:
            return ''

        return uploader_tag.string.strip()
コード例 #11
0
    def _get_mime_type(self):
        '''
        Get general.mime_type from data

        :return: stripped string of the first cscida 'general.mime_type' tag,
                 otherwise the result of
                 ``_get_description_value('general.mime_type')``
        '''
        tag_filter = _filter_tag_name_namespace(name='general.mime_type', namespace=NS['cscida'])
        mime_tag = first(self.dc(tag_filter, recursive=False))
        if not mime_tag:
            return self._get_description_value('general.mime_type')

        return mime_tag.string.strip()
コード例 #12
0
    def _get_availability(self):
        """ Get availability, preferring the cscida tag over description values.

        :return: one-item list with the stripped tag string when a cscida
                 'availability' tag exists, otherwise the result of
                 ``_get_description_values('availability')``
        """
        tag_filter = _filter_tag_name_namespace(name='availability',
                                                namespace=NS['cscida'])
        availability_tag = first(self.dc(tag_filter, recursive=False))
        if not availability_tag:
            return self._get_description_values('availability')

        return [availability_tag.string.strip()]
コード例 #13
0
ファイル: controllers.py プロジェクト: xbian/ckanext-kata
    def _get_contact_email(self, pkg_id, contact_id):
        """
        Look up a package contact and return its name and email.

        :param pkg_id: package id passed to ``utils.get_package_contacts``
        :param contact_id: id of the wanted contact
        :return: dict with 'name' and 'email', or None when no contact id is
                 given, no contact matches, or the contact has no 'email' key
        """
        if not contact_id:
            return None

        def _has_wanted_id(candidate):
            return candidate.get('id') == contact_id

        contacts = utils.get_package_contacts(pkg_id)
        match = fn.first(filter(_has_wanted_id, contacts))
        if not match or 'email' not in match.keys():
            return None

        email = match.get('email')
        name = match.get('name')
        return {'name': name, 'email': email}
コード例 #14
0
    def _get_contact_email(self, pkg_id, contact_id):
        """
        Look up a package contact and return its name and email.

        :param pkg_id: package id passed to ``utils.get_package_contacts``
        :param contact_id: id of the wanted contact
        :return: dict with 'name' and 'email', or None when no contact id is
                 given, no contact matches, or the contact has no 'email' key
        """
        if not contact_id:
            return None

        def _has_wanted_id(candidate):
            return candidate.get('id') == contact_id

        contacts = utils.get_package_contacts(pkg_id)
        match = fn.first(filter(_has_wanted_id, contacts))
        if not match or 'email' not in match.keys():
            return None

        email = match.get('email')
        name = match.get('name')
        return {'name': name, 'email': email}
コード例 #15
0
    def _get_persons(cls, root, xpath):
        """ Build person dictionaries from the elements selected by an Xpath.

        :param root: parent element (lxml) where selection is done
        :param xpath: xpath selector used to get data
        :return: list of dicts with the keys 'role', 'surname', 'given_name',
                 'email' and 'organization'
        """
        def stripped_text(node, selector):
            # First text match of `selector` below `node`, stripped.
            return cls._strip_first(node.xpath(selector, namespaces=cls.namespaces))

        persons = []
        for node in root.xpath(xpath, namespaces=cls.namespaces):
            role = stripped_text(node, "cmd:role/text()")
            surname = stripped_text(node, "cmd:personInfo/cmd:surname/text()")
            given_name = stripped_text(node, "cmd:personInfo/cmd:givenName/text()")
            email = stripped_text(node, "cmd:personInfo/cmd:communicationInfo/cmd:email/text()")
            # Only the first affiliation is kept for each person.
            organization = first(cls._get_organizations(node, "cmd:personInfo/cmd:affiliation"))
            persons.append({
                'role': role,
                'surname': surname,
                'given_name': given_name,
                'email': email,
                'organization': organization,
            })
        return persons
コード例 #16
0
    def _get_mime_type(self):
        '''
        Get general.mime_type from data

        :return: stripped string of the first cscida 'general.mime_type' tag,
                 otherwise the result of
                 ``_get_description_value('general.mime_type')``
        '''
        tag_filter = _filter_tag_name_namespace(name='general.mime_type',
                                                namespace=NS['cscida'])
        mime_tag = first(self.dc(tag_filter, recursive=False))
        if not mime_tag:
            return self._get_description_value('general.mime_type')

        return mime_tag.string.strip()
コード例 #17
0
    def _get_uploader(self):
        '''
        Get uploader from cscida tags

        :return: stripped string of the first cscida 'uploader' tag, or an
                 empty string when no such tag is found
        '''
        tag_filter = _filter_tag_name_namespace(name='uploader',
                                                namespace=NS['cscida'])
        uploader_tag = first(self.dc(tag_filter, recursive=False))
        if not uploader_tag:
            return ''

        return uploader_tag.string.strip()
コード例 #18
0
ファイル: utils.py プロジェクト: xbian/ckanext-kata
def get_member_role(group_id, user_id):
    """
    Get the user's role for this group.

    Only active memberships whose table name is 'user' are considered.

    :param group_id: Group ID
    :param user_id: User ID
    :return: capacity of the first matching membership, as picked by
             ``fn.first`` from the list of matching capacities
    """
    memberships = model.Session.query(model.Member)
    memberships = memberships.filter(model.Member.group_id == group_id)
    memberships = memberships.filter(model.Member.table_name == 'user')
    memberships = memberships.filter(model.Member.state == 'active')
    memberships = memberships.filter(model.Member.table_id == user_id)

    capacities = [membership.capacity for membership in memberships.all()]
    return fn.first(capacities)
コード例 #19
0
def _get_project_stuff(tag_tree):
    '''
    Get project_funder, project_funding, project_name, project_homepage

    :param tag_tree: metadata (dc) element in BeautifulSoup tree
    :return: the per-project tuples transposed with zip(), or None when no
             contributor carries a Project
    '''
    def project_rows():
        # One tuple per contributor that has a Project child.
        contributor_filter = _filter_tag_name_namespace(name='contributor', namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            project = contributor.Project
            if not project:
                continue

            if project.comment:
                # The comment holds funder and funding joined by a fixed phrase.
                funder_funding = project.comment.string.split(u' rahoituspäätös ')
            else:
                funder_funding = ('', '')

            name_tag = project.find('name')
            name = name_tag.string if name_tag else ''
            about = project.get('about', '')
            yield tuple(funder_funding) + (name, about)

    # Probe with a throwaway generator, then transpose a fresh one.
    if first(project_rows()):
        return zip(*project_rows())
    return None
コード例 #20
0
ファイル: utils.py プロジェクト: atehwa/ckanext-kata
def get_member_role(group_id, user_id):
    """
    Get the user's role for this group.

    Only active memberships whose table name is 'user' are considered.

    :param group_id: Group ID
    :param user_id: User ID
    :return: capacity of the first matching membership, as picked by
             ``fn.first`` from the list of matching capacities
    """
    memberships = model.Session.query(model.Member)
    memberships = memberships.filter(model.Member.group_id == group_id)
    memberships = memberships.filter(model.Member.table_name == 'user')
    memberships = memberships.filter(model.Member.state == 'active')
    memberships = memberships.filter(model.Member.table_id == user_id)

    capacities = [membership.capacity for membership in memberships.all()]
    return fn.first(capacities)
コード例 #21
0
def _get_project_stuff(tag_tree):
    '''
    Get project_funder, project_funding, project_name, project_homepage

    :param tag_tree: metadata (dc) element in BeautifulSoup tree
    :return: the per-project tuples transposed with zip(), or None when no
             contributor carries a Project
    '''
    def project_rows():
        # One tuple per contributor that has a Project child.
        contributor_filter = _filter_tag_name_namespace(name='contributor',
                                                        namespace=NS['dct'])
        for contributor in tag_tree(contributor_filter, recursive=False):
            project = contributor.Project
            if not project:
                continue

            if project.comment:
                # The comment holds funder and funding joined by a fixed phrase.
                funder_funding = project.comment.string.split(
                    u' rahoituspäätös ')
            else:
                funder_funding = ('', '')

            name_tag = project.find('name')
            name = name_tag.string if name_tag else ''
            about = project.get('about', '')
            yield tuple(funder_funding) + (name, about)

    # Probe with a throwaway generator, then transpose a fresh one.
    if first(project_rows()):
        return zip(*project_rows())
    return None
コード例 #22
0
    def __init__(self, xml, provider=None):
        """ Initialize the helper for parsing the given xml.

        :param xml: an lxml object, representing a CMDI record
        :param provider: provider stored on the helper; when falsy the
                         'ckan.site_url' config value is used instead
        :raises CmdiParseException: if the CMD element or the resourceInfo
                                    element cannot be found
        """
        namespaces = CmdiParseHelper.namespaces

        cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD',
                              namespaces=namespaces))
        if cmd is None:
            raise CmdiParseException(
                "Unexpected XML format: No CMD -element found")

        # Take the first match via first() rather than indexing with [0]:
        # indexing an empty xpath result raises IndexError, so the missing
        # element would never reach the explicit check below.
        resource_info = first(cmd.xpath("//cmd:Components/cmd:resourceInfo",
                                        namespaces=namespaces))
        if resource_info is None:
            raise CmdiParseException(
                "Unexpected XML format: No resourceInfo -element found")

        self.xml = xml
        self.cmd = cmd
        self.resource_info = resource_info
        self.provider = provider or config.get('ckan.site_url')
コード例 #23
0
 def parse_license(self):
     """ Find the license for the metadata.

     :return: first licence text found under distributionInfo/licenceInfo,
              as picked by ``first``
     """
     licence_path = "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()"
     licences = self._text_xpath(self.resource_info, licence_path)
     return first(licences)
コード例 #24
0
 def parse_modified(self):
     """ Find date when metadata was last modified.

     :return: first metadataLastDateUpdated text, as picked by ``first``
     """
     modified_path = "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()"
     timestamps = self._text_xpath(self.resource_info, modified_path)
     return first(timestamps)
コード例 #25
0
ファイル: ddi25.py プロジェクト: CSCfi/metax-ckanext-etsin
def ddi25_mapper(xml):
    """ Convert given DDI 2.5 XML into MetaX format dict.

    The returned dict always has the keys 'preferred_identifier', 'title',
    'creator', 'description', 'keyword', 'field_of_science', 'publisher',
    'provenance' and 'spatial'. 'modified' and 'temporal' are added only
    when the corresponding data is found.

    :param xml: xml element (lxml)
    :return: dictionary
    """

    namespaces = {'oai': "http://www.openarchives.org/OAI/2.0/",
                  'ddi': "ddi:codebook:2_5"}

    # NOTE(review): no guard for a missing codeBook / stdyDscr element;
    # confirm callers only pass records that contain them.
    cb = first(xml.xpath('//oai:record/oai:metadata/ddi:codeBook', namespaces=namespaces))
    stdy = cb.find('ddi:stdyDscr', namespaces)

    # Preferred identifier
    pref_id = None
    id_nos = stdy.findall('ddi:citation/ddi:titlStmt/ddi:IDNo', namespaces)
    id_no = first(filter(lambda x: x.get('agency') == 'Kansalliskirjasto', id_nos))
    if id_no is not None:
        pref_id = id_no.text

    # Title, keyed by language
    title = {}
    titl = stdy.findall('ddi:citation/ddi:titlStmt/ddi:titl', namespaces)
    if len(titl):
        for t in titl:
            title[get_tag_lang(t)] = t.text

    # Creator
    # Assume that 'AuthEnty' tags for different language 'citations' are in same order
    creators = []
    try:
        for i, citation in enumerate(stdy.findall('ddi:citation', namespaces)):
            for j, author in enumerate(citation.xpath(
                    'ddi:rspStmt/ddi:AuthEnty|ddi:rspStmt/ddi:othId',
                    namespaces=namespaces)):
                agent_obj = {'name': None}
                if 'affiliation' in author.keys():
                    # An author with an affiliation is treated as a person.
                    org = author.get('affiliation')
                    if i == 0:
                        agent_obj['@type'] = 'Person'
                        if org is not None:
                            agent_obj['member_of'] = {
                                'name': {
                                    get_tag_lang(author): org},
                                '@type': 'Organization'}
                        # TODO: Check here that othIds are handled correctly
                        agent_obj['name'] = author.text.strip()
                        creators.append(agent_obj)
                    elif org is not None:
                        # Later citations only add translated organization names.
                        creators[j]['member_of']['name'][get_tag_lang(author)] = org
                else:
                    # Without an affiliation the author is an organization.
                    if i == 0:
                        agent_obj['@type'] = 'Organization'
                        agent_obj['name'] = {get_tag_lang(author): author.text.strip()}
                        creators.append(agent_obj)
                    else:
                        creators[j]['name'][get_tag_lang(author)] = author.text.strip()
                if author.tag.split('}')[1] == 'othId':
                    log.info('Tag "othId" found, check it is correctly parsed(TODO)!')
    except Exception as e:
        log.error('Error parsing "creators": {0}: {1}. Check that different '
                  'language elements match at the source.'.format(e.__class__.__name__, e))
        raise

    # Modified
    modified = None
    ver_stmt = stdy.find('ddi:citation/ddi:verStmt/ddi:version', namespaces)
    if ver_stmt is not None and ver_stmt.get('date'):
        modified = get_string_as_valid_datetime_string(ver_stmt.get('date'), '01-01')

    # Description, keyed by language
    description = {}
    try:
        for abstract in stdy.findall('ddi:stdyInfo/ddi:abstract', namespaces):
            description[get_tag_lang(abstract)] = unicode(abstract.text).strip()
    except Exception as e:
        log.error('Error parsing "description": {0}: {1}'.format(e.__class__.__name__, e))
        raise

    # Keywords: free keywords plus CESSDA topic classification terms
    keywords = []
    for kw in stdy.findall('ddi:stdyInfo/ddi:subject/ddi:keyword', namespaces):
        keywords.append(kw.text.strip())
    vocab = 'CESSDA Topic Classification'
    for cterm in stdy.findall("ddi:stdyInfo/ddi:subject/ddi:topcClas[@vocab='{0}']".format(vocab), namespaces):
        keywords.append(cterm.text.strip())

    # Field of science
    codes = set()
    for fos in stdy.findall("ddi:stdyInfo/ddi:subject/ddi:topcClas[@vocab='OKM']", namespaces):
        field = 'label.' + get_tag_lang(fos)
        codes.add(get_ref_data('field_of_science', field, fos.text.strip(), 'code'))
    field_of_science = [{'identifier': c} for c in codes]
    if not len(field_of_science):
        # Fall back to a fixed identifier when nothing was found.
        log.debug("No 'field of science' found.")
        field_of_science.append({'identifier': 'ta5'})

    # Publisher
    publisher = {
        'name': {},
        '@type': 'Organization',
        "homepage": {
            "title": {
                "en": "Publisher website",
                "fi": "Julkaisijan kotisivu"},
            "identifier": ""}
    }
    for dist in stdy.findall('ddi:citation/ddi:distStmt', namespaces):
        distr = dist.find('ddi:distrbtr', namespaces)
        publisher['name'][get_tag_lang(distr)] = distr.text.strip()
        publisher['homepage']['identifier'] = distr.get('URI')

    # Temporal coverage
    tpath = "ddi:stdyInfo/ddi:sumDscr/ddi:{tag}[@event='{ev}']"

    def _find_event(ev):
        # Prefer 'timePrd' and fall back to 'collDate'. The result is compared
        # against None explicitly because an lxml element without child
        # elements evaluates as false, so chaining the lookups with `or`
        # would discard a 'timePrd' element that was actually found.
        for tag in ('timePrd', 'collDate'):
            elem = stdy.find(tpath.format(tag=tag, ev=ev), namespaces)
            if elem is not None:
                return elem
        return None

    tstart = _find_event('start')
    tend = _find_event('end')
    if tstart is None and tend is None:
        # No start/end pair: a 'single' event serves as both ends.
        tstart = _find_event('single')
        tend = tstart
    elif tstart is None or tend is None:
        log.error('No temporal coverage or only start or end date in dataset!')

    temporal_coverage_obj_1 = {}

    if tstart is not None and tstart.get('date'):
        start_dt = get_string_as_valid_datetime_string(tstart.get('date'), '01-01', '00:00:00')
        if start_dt is None:
            # Start date is not a valid datetime: keep the raw text instead.
            temporal_coverage_obj_1['temporal_coverage'] = tstart.get('date')
            if tend is not None and tend.get('date'):
                temporal_coverage_obj_1['temporal_coverage'] += ' - ' + tend.get('date')
        else:
            temporal_coverage_obj_1['start_date'] = start_dt
            if tend is not None and tend.get('date'):
                end_dt = get_string_as_valid_datetime_string(tend.get('date'), '12-31', '23:59:59')
                if end_dt is not None:
                    temporal_coverage_obj_1['end_date'] = end_dt

    # Provenance
    universe = {}
    univ = stdy.findall("ddi:stdyInfo/ddi:sumDscr/ddi:universe", namespaces)
    for u in univ:
        universe[get_tag_lang(u)] = u.text.strip()
    provenance = [{'title': {'en': 'Collection'},
                   'description': {
                       'en': 'Contains the date(s) when the data were collected.'},
                   'variable': [{'pref_label': universe}]
                   }]
    if temporal_coverage_obj_1:
        provenance[0]['temporal'] = temporal_coverage_obj_1

    # Production
    prod = stdy.find('ddi:citation/ddi:prodStmt/ddi:prodDate', namespaces)
    if prod is not None:
        temporal_coverage_obj_2 = {}

        if prod.text:
            start_dt = get_string_as_valid_datetime_string(prod.text.strip(), '01-01', '00:00:00')
            if start_dt is None:
                temporal_coverage_obj_2['temporal_coverage'] = prod.text.strip()
            else:
                temporal_coverage_obj_2['start_date'] = start_dt
                temporal_coverage_obj_2['end_date'] = get_string_as_valid_datetime_string(
                    prod.text.strip(), '12-31', '23:59:59')
        provenance.append(
            {'title': {'en': 'Production'},
             'description': {'en': 'Date when the data collection were'
                                   ' produced (not distributed or archived)'}})
        if temporal_coverage_obj_2:
            provenance[1]['temporal'] = temporal_coverage_obj_2

    # Geographical coverage
    spatial = [{}]
    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    lang_path = "ddi:stdyInfo/ddi:sumDscr/ddi:nation[@{la}='{lt}']"
    nat_fi = stdy.find(lang_path.format(la=lang_attr, lt='fi'), namespaces)
    nat_en = stdy.find(lang_path.format(la=lang_attr, lt='en'), namespaces)
    if nat_en is not None:
        spatial = [{'geographic_name': nat_en.text.strip()}]
    if nat_fi is not None:
        # Assume Finland so search ES for Finnish place names: 'nat_fi'
        spat_id = get_ref_data('location', 'label.fi', nat_fi.text.strip(),
                               'code')
        if spat_id is not None:
            spatial[0]['place_uri'] = {'identifier': spat_id}
        if spatial[0].get('geographic_name') is None:
            # The English name wins; the Finnish one is only a fallback.
            spatial[0]['geographic_name'] = nat_fi.text.strip()

    package_dict = {
        "preferred_identifier": pref_id,
        "title": title,
        "creator": creators,
        "description": description,
        "keyword": keywords,
        "field_of_science": field_of_science,
        "publisher": publisher,
        "provenance": provenance,
        "spatial": spatial
    }

    if modified is not None:
        package_dict['modified'] = modified

    if temporal_coverage_obj_1:
        package_dict['temporal'] = [temporal_coverage_obj_1]

    return package_dict
コード例 #26
0
 def _get_description_value(self, key):
     """ Return the first of the description values found for `key`. """
     values = self._get_description_values(key)
     return first(values)
コード例 #27
0
def get_funder(data_dict):
    '''Return the first funder that get_funders() finds for data_dict'''
    funders = get_funders(data_dict)
    return fn.first(funders)
コード例 #28
0
ファイル: fsd.py プロジェクト: CSCfi/metax-ckanext-etsin
def fsd_refiner(context, data_dict):
    """ Refines the given MetaX data dict in a FSD-specific way

    Sets 'language' and fills 'access_rights' (license, access type,
    restriction grounds, description) from the DDI codebook found in the
    context's 'source_data', then hands the dict to
    ``set_existing_kata_identifier_to_other_identifier``.

    :param context: Dictionary with an lxml-field
    :param data_dict: Dataset dictionary in MetaX format
    :return: the same data_dict object, modified in place
    """
    ns = {'oai': "http://www.openarchives.org/OAI/2.0/",
          'ddi': "ddi:codebook:2_5"}
    # Each rule maps a pattern, matched at the start of the English
    # restriction text, to the license / access type it implies.
    access_rules = [
        {'match': r"The dataset is \(A\)",
         'license': 'other-open',
         'access_type': 'open'},
        {'match': r"The dataset is \(B\)",
         'license': 'other-closed',
         'access_type': 'restricted',
         'restriction_grounds': ['education', 'research']},
        {'match': r"The dataset is \(C\)",
         'license': 'other-closed',
         'access_type': 'restricted',
         'restriction_grounds': ['research']},
        {'match': r"The dataset is \(D\)",
         'license': 'other-closed',
         'access_type': 'restricted'},
    ]

    source_xml = context.get('source_data')
    codebook = first(source_xml.xpath('//oai:record/oai:metadata/ddi:codeBook',
                                      namespaces=ns))

    # Language
    file_langs = [get_tag_lang(file_name) for file_name in codebook.findall(
        'ddi:fileDscr/ddi:fileTxt/ddi:fileName', ns)]
    data_dict['language'] = [
        {'identifier': get_language_identifier(convert_language(code))}
        for code in file_langs]

    # Licence and access type
    access_rights = data_dict.setdefault('access_rights', {})

    restriction = {}
    for node in codebook.findall(
            'ddi:stdyDscr/ddi:dataAccs/ddi:useStmt/ddi:restrctn', ns):
        restriction[get_tag_lang(node)] = node.text.strip()

    english = restriction.get('en', '')
    if english:
        for rule in access_rules:
            if not re.match(rule['match'], english):
                continue

            access_rights['license'] = [{
                'identifier': rule['license'],
                'description': restriction}]
            access_rights['access_type'] = {'identifier': rule['access_type']}

            grounds = [{'identifier': ground}
                       for ground in rule.get('restriction_grounds', [])]
            if grounds:
                access_rights['restriction_grounds'] = grounds

            # Only the first matching rule applies.
            break
        if access_rights.get('license') is None:
            log.error('Unknown licence in dataset')

    conditions = {}
    for node in codebook.findall(
            'ddi:stdyDscr/ddi:dataAccs/ddi:useStmt/ddi:conditions', ns):
        conditions[get_tag_lang(node)] = node.text.strip()
    if conditions:
        access_rights['description'] = conditions

    # Default to restricted access when nothing set an access type.
    if 'access_type' not in access_rights:
        access_rights['access_type'] = {
            'identifier': 'http://uri.suomi.fi/codelist/fairdata/access_type/code/restricted'
        }

    # Add old pid
    old_pids_path = os.path.dirname(__file__) + '/resources/fsd_pid_to_kata_urn.csv'
    set_existing_kata_identifier_to_other_identifier(
        old_pids_path, data_dict['preferred_identifier'], data_dict)

    return data_dict
コード例 #29
0
def get_distributor(data_dict):
    '''Return the first distributor that get_distributors() finds for data_dict'''
    distributors = get_distributors(data_dict)
    return fn.first(distributors)
コード例 #30
0
def get_funder(data_dict):
    '''Return the first entry of get_funders(data_dict).

    :param data_dict: dictionary handed unchanged to get_funders
    :return: whatever fn.first picks from the funders
    '''
    funders = get_funders(data_dict)
    return fn.first(funders)
コード例 #31
0
    def read_data(self, xml):
        """ Extract package data from given XML.

        :param xml: xml element (lxml)
        :return: dictionary
        :raises CmdiReaderException: if no CMD or resourceInfo element is found
        """
        cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD', namespaces=self.namespaces))
        if cmd is None:
            raise CmdiReaderException("Unexpected XML format: No CMD -element found")

        # xpath() returns a list, so pick the first match with first() (as done for
        # cmd above). Indexing with [0] would raise IndexError on a record without
        # resourceInfo and the check below could never fire.
        resource_info = first(cmd.xpath("//cmd:Components/cmd:resourceInfo", namespaces=self.namespaces))
        if resource_info is None:
            raise CmdiReaderException("Unexpected XML format: No resourceInfo -element found")

        metadata_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:identifier/text()")
        data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")

        languages = self._text_xpath(cmd, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()")

        xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

        # convert the descriptions to a JSON string of type {"fin": "kuvaus", "eng": "desc"}
        desc_json = {}
        for desc in xml.xpath("//cmd:identificationInfo/cmd:description", namespaces=self.namespaces):
            lang = utils.convert_language(desc.get(xml_lang, 'undefined').strip())
            # An element without text has text None: store "" instead of the string "None".
            desc_json[lang] = unicode(desc.text or '').strip()

        description = json.dumps(desc_json)

        # convert the titles to a JSON string of type {"fin": "otsikko", "eng": "title"}
        transl_json = {}
        for name_element in xml.xpath('//cmd:identificationInfo/cmd:resourceName', namespaces=self.namespaces):
            lang = utils.convert_language(name_element.get(xml_lang, 'undefined').strip())
            # Guard against an element without text, where .text is None.
            transl_json[lang] = (name_element.text or '').strip()

        title = json.dumps(transl_json)

        version = first(self._text_xpath(resource_info, "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
        coverage = first(self._text_xpath(resource_info, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()")) or ""
        license_identifier = first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()")) or 'notspecified'

        primary_pid = None
        provider = self.provider

        # Identifiers containing 'urn' become the primary pid (the last one wins);
        # the remaining metadata identifiers are stored as 'metadata' pids.
        pids = []
        for metadata_pid in metadata_identifiers:
            if 'urn' in metadata_pid:
                primary_pid = metadata_pid
            else:
                pids.append(dict(id=metadata_pid, provider=provider, type='metadata'))

        # A data pid is flagged primary when it equals the first data identifier.
        # This gives the same result as data_identifiers.index(pid) == 0 without
        # rescanning the list for every pid.
        pids += [dict(id=pid, provider=provider, type='data', primary=(pid == data_identifiers[0]))
                 for pid in data_identifiers]

        temporal_coverage_begin = ""
        temporal_coverage_end = ""

        if coverage:
            # Only a "<begin>-<end>" value with exactly one dash is used.
            split = [item.strip() for item in coverage.split("-")]
            if len(split) == 2:
                temporal_coverage_begin, temporal_coverage_end = split

        # TODO: Check agent mapping. Not mapped yet:
        #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson
        #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization
        #   //cmd:metadataInfo/cmd:metadataCreator
        contacts = self._persons_as_contact(self._get_persons(resource_info, "//cmd:contactPerson"))

        agents = []
        agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
        agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"), 'owner'))

        agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization"), 'author'))
        agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"), 'owner'))

        result = {'name': self._to_name(primary_pid or first(metadata_identifiers)),
                  'language': ",".join(languages),
                  'pids': pids,
                  'version': version,
                  'notes': description,
                  'title': title,
                  'type': 'dataset',
                  'contact': contacts,
                  'agent': agents,
                  'availability': 'contact_owner',
                  'temporal_coverage_begin': temporal_coverage_begin,
                  'temporal_coverage_end': temporal_coverage_end,
                  'license_id': license_identifier}

        if not languages:
            result['langdis'] = u'True'

        if primary_pid:
            result['id'] = primary_pid

        # TODO: Ask about distributionAccessMedium (the availability element under
        # cmd:distributionInfo is not read yet).
        download_location = first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:downloadLocation/text()"))

        # A download location switches the availability away from 'contact_owner'.
        if download_location:
            result['through_provider_URL'] = download_location
            result['availability'] = 'through_provider'

        return result
コード例 #32
0
    def _read(self):
        """ Build the unified internal harvester format dict from self.dc.

        Besides self.dc, the values come from self.bs (provider lookup) and from
        several helper functions and methods of the reader. A fresh package id is
        always generated with generate_pid() and used for both 'id' and 'name'.

        :return: dictionary in the unified harvester format
        """
        # Both helpers are unpacked into four values; a falsy result is replaced
        # by four empty strings.
        project_funder, project_funding, project_name, project_homepage = _get_project_stuff(self.dc) or ('', '', '', '')

        # Todo! This needs to be improved to use also simple-dc
        # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
        availability, license_id, license_url, access_application_url = _get_rights(self.dc) or ('', '', '', '')
        if not availability:
            # No availability from the rights data: fall back to the reader's own lookup.
            availability = first(self._get_availability())

        uploader = self._get_uploader()

        # Materialised as a list because _get_primary_pid() below removes an item from it.
        data_pids = list(_get_data_pids(self.dc))

        # NOTE(review): this initial assignment is overwritten a few lines below.
        tags = []
        #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
        #    tags.extend(self._resolve_tags(tag))
        tags = [a.string for a in self.dc('subject', recursive=False)]

        # Titles are collected per language and stored as one JSON string.
        transl_json = {}
        for title in self.dc('title', recursive=False):
            lang = utils.convert_language(title.get('xml:lang', '').strip())
            transl_json[lang] = title.string.strip()

        title = json.dumps(transl_json)

        def _get_primary_pid(data_pids):
            """ Remove the first pid starting with 'urn:nbn:fi:csc-ida' from
            data_pids and return it in a one-item list; return an empty list
            when there is no such pid. """
            for dpid in data_pids:
                if dpid.startswith('urn:nbn:fi:csc-ida'):
                    # Mutating during iteration is safe here: we return immediately.
                    data_pids.remove(dpid)
                    return [dpid]
            return []

        # Create a unified internal harvester format dict
        unified = dict(
            # ?=dc('source', recursive=False),
            # ?=dc('relation', recursive=False),
            # ?=dc('type', recursive=False),

            access_application_URL=access_application_url or '',

            # Todo! Implement
            access_request_URL='',

            algorithm=first(_get_algorithm(self.dc)) or '',

            # TODO: Handle availabilities better
            availability=availability,

            checksum=_get_checksum(self.dc) or '',

            direct_download_URL=first(_get_download(self.dc)) or '',

            # Todo! Implement
            discipline='',

            # Todo! Should be possible to implement with QDC, but not with OAI_DC
            # evdescr=[],
            # evtype=[],
            # evwhen=[],
            # evwho=[],

            # Todo! Implement
            geographic_coverage='',

            #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],

            title=title,

            # Comma separated, sorted language strings
            language=','.join(sorted([a.string for a in self.dc('language', recursive=False)])),

            license_URL=license_url or '',
            license_id=license_id or 'notspecified',

            # Todo! Using only the first entry, for now
            contact=[dict(name=name or "", email=email or "", URL=url or "", phone=phone or "")
                     for name, email, phone, url in self._get_maintainer_stuff()],

            # Todo! IDA currently doesn't produce this, maybe in future
            # dc('hasFormat', recursive=False)
            mimetype=self._get_mime_type(),

            notes=self._read_notes(),

            # Todo! Using only the first entry, for now
            # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

            # Order matters: _get_primary_pid() is evaluated first and removes the
            # primary pid from data_pids, so it is not repeated in the relation
            # list built from data_pids on the next line.
            pids=[dict(id=pid, provider=_get_provider(self.bs), type=u'primary') for pid in _get_primary_pid(data_pids)] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in data_pids] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in self._get_version_pids()] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in _get_metadata_pid(self.dc)],

            # Agents: authors and contributors from the data, then exactly one
            # funder entry and one owner entry (with empty strings when nothing is found).
            agent=[dict(role='author', name=orgauth.get('value', ''), id='', organisation=orgauth.get('org', ''), URL='', fundingid='') for orgauth in _get_org_auth(self.dc)] +
                  [dict(role='contributor', name=contributor.get('value', ''), id='', organisation=contributor.get('org', ''), URL='', fundingid='') for contributor in _get_contributor(self.dc)] +
                  [dict(role='funder', name=first(project_name) or '', id=first(project_name) or '', organisation=first(project_funder) or "", URL=first(project_homepage) or '', fundingid=first(project_funding) or '',)] +
                  [dict(role='owner', name=first([a.get('resource') for a in self.dc('rightsHolder', recursive=False)]) or first(_get_rightsholder(self.dc)) or '', id='', organisation='', URL='', fundingid='')],

            tag_string=','.join(tags) or '',

            # Todo! Implement if possible
            temporal_coverage_begin='',
            temporal_coverage_end='',

            type='dataset',
            uploader=uploader,

            # Used in smear harvest code to extract variable, station and year values, but is not used when
            # creating the dataset via API.
            smear_url=first(_get_download(self.dc, False)) or '',

            # Todo! This should be more exactly picked
            # Uses the 'modified' element when present, otherwise 'date'; empty string when neither exists.
            version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
            # version=dc(
            #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
            #         partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,

        )
        # Flag datasets for which no language was found.
        if not unified['language']:
            unified['langdis'] = 'True'

        # Create id and name
        unified['id'] = generate_pid()
        unified['name'] = pid_to_name(unified['id'])

        # If primary pid is missing, set package id as primary pid
        if not any(pid.get('type', None) == u'primary' for pid in unified['pids']):
            unified['pids'].append(dict(id=unified['id'], type=u'primary', provider=None))

        # if not unified['project_name']:
        #    unified['projdis'] = 'True'
        return unified
コード例 #33
0
 def _get_mime_type(self):
     """ Return the string of the first 'format' entry of self.dc whose text
     contains a slash, or an empty string when nothing usable is found. """
     slash_pattern = re.compile('/')
     format_entries = self.dc('format', text=slash_pattern, recursive=False)
     format_strings = [entry.string for entry in format_entries]
     return first(format_strings) or ''
コード例 #34
0
 def _get_description_value(self, key):
     """ Return the first of the description values looked up for key. """
     values = self._get_description_values(key)
     return first(values)
コード例 #35
0
    def read_data(self, xml):
        """ Extract package data from given XML.

        :param xml: xml element (lxml)
        :return: dictionary
        :raises CmdiReaderException: if no CMD or resourceInfo element is found
        """
        cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD', namespaces=self.namespaces))
        if cmd is None:
            raise CmdiReaderException("Unexpected XML format: No CMD -element found")

        # xpath() returns a list, so pick the first match with first() (as done for
        # cmd above). Indexing with [0] would raise IndexError on a record without
        # resourceInfo and the check below could never fire.
        resource_info = first(cmd.xpath("//cmd:Components/cmd:resourceInfo", namespaces=self.namespaces))
        if resource_info is None:
            raise CmdiReaderException("Unexpected XML format: No resourceInfo -element found")

        metadata_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:identifier/text()")

        languages = self._text_xpath(cmd, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()")

        xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

        # convert the descriptions to a JSON string of type {"fin": "kuvaus", "eng": "desc"}
        desc_json = {}
        for desc in xml.xpath("//cmd:identificationInfo/cmd:description", namespaces=self.namespaces):
            lang = convert_language(desc.get(xml_lang, 'undefined').strip())
            # An element without text has text None: store "" instead of the string "None".
            desc_json[lang] = unicode(desc.text or '').strip()

        description = json.dumps(desc_json)

        # convert the titles to a JSON string of type {"fin": "otsikko", "eng": "title"}
        transl_json = {}
        for name_element in xml.xpath('//cmd:identificationInfo/cmd:resourceName', namespaces=self.namespaces):
            lang = convert_language(name_element.get(xml_lang, 'undefined').strip())
            # Guard against an element without text, where .text is None.
            transl_json[lang] = (name_element.text or '').strip()

        title = json.dumps(transl_json)
        provider = self.provider
        version = first(self._text_xpath(resource_info, "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
        coverage = first(self._text_xpath(resource_info, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()")) or ""

        pids = []
        primary_pid = ''
        direct_download_URL = ''
        access_request_URL = ''
        access_application_URL = ''

        # Only the first identifier containing 'urn' is recorded, as the primary pid.
        # Other metadata identifiers and the data identifiers
        # (//cmd:identificationInfo/cmd:url) are currently not stored as relation pids.
        for metadata_pid in metadata_identifiers:
            pid = CmdiReader._language_bank_urn_pid_enhancement(metadata_pid)
            if 'urn' in pid and not primary_pid:
                pids.append(dict(id=pid, provider=provider, type='primary'))
                primary_pid = pid

        raw_license = first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()")) or 'notspecified'
        license_identifier = CmdiReader._language_bank_license_enhancement(raw_license)
        availability = CmdiReader._language_bank_availability_from_license(license_identifier)

        # The access URLs are derived from the primary pid, unless the licence is
        # still under negotiation.
        if license_identifier.lower().strip() != 'undernegotiation':
            if availability == 'direct_download':
                direct_download_URL = primary_pid
            elif availability == 'access_request':
                access_request_URL = primary_pid
            elif availability == 'access_application_other':
                # The part after the last '/' of the primary pid identifies the resource.
                sliced_pid = primary_pid.rsplit('/', 1)
                if len(sliced_pid) >= 2:
                    access_application_URL = 'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource=' + sliced_pid[1]

        temporal_coverage_begin = ""
        temporal_coverage_end = ""

        if coverage:
            # Only a "<begin>-<end>" value with exactly one dash is used.
            split = [item.strip() for item in coverage.split("-")]
            if len(split) == 2:
                temporal_coverage_begin, temporal_coverage_end = split

        # TODO: Check agent mapping. Not mapped yet:
        #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson
        #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization
        #   //cmd:metadataInfo/cmd:metadataCreator
        contacts = self._persons_as_contact(self._get_persons(resource_info, "//cmd:contactPerson"))

        agents = []
        agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
        agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"), 'owner'))

        agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization"), 'author'))
        agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"), 'owner'))

        # Reuse the id of a package already known by this primary pid, otherwise
        # allocate a new one.
        existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
        package_id = existing_package_id if existing_package_id else get_unique_package_id()

        result = {'name': pid_to_name(package_id),
                  'language': ",".join(languages),
                  'pids': pids,
                  'version': version,
                  'notes': description,
                  'title': title,
                  'type': 'dataset',
                  'contact': contacts,
                  'agent': agents,
                  'availability': availability,
                  'direct_download_URL': direct_download_URL,
                  'access_request_URL': access_request_URL,
                  'access_application_URL': access_application_URL,
                  'temporal_coverage_begin': temporal_coverage_begin,
                  'temporal_coverage_end': temporal_coverage_end,
                  'license_id': license_identifier,
                  'license_URL': ''}

        if not languages:
            result['langdis'] = u'True'

        if package_id:
            result['id'] = package_id

        # TODO: Ask about distributionAccessMedium (the availability element under
        # cmd:distributionInfo is not read yet).

        return result
コード例 #36
0
def get_distributor(data_dict):
    '''Return the first entry of get_distributors(data_dict).

    :param data_dict: dictionary handed unchanged to get_distributors
    :return: whatever fn.first picks from the distributors
    '''
    distributors = get_distributors(data_dict)
    return fn.first(distributors)
コード例 #37
0
    def read_data(self, xml):
        """ Extract package data from given XML.

        :param xml: xml element (lxml)
        :return: dictionary
        :raises CmdiReaderException: if no CMD or resourceInfo element is found
        """
        cmd = first(
            xml.xpath('//oai:record/oai:metadata/cmd:CMD',
                      namespaces=self.namespaces))
        if cmd is None:
            raise CmdiReaderException(
                "Unexpected XML format: No CMD -element found")

        # xpath() returns a list, so pick the first match with first() (as done
        # for cmd above). Indexing with [0] would raise IndexError on a record
        # without resourceInfo and the check below could never fire.
        resource_info = first(
            cmd.xpath("//cmd:Components/cmd:resourceInfo",
                      namespaces=self.namespaces))
        if resource_info is None:
            raise CmdiReaderException(
                "Unexpected XML format: No resourceInfo -element found")

        metadata_identifiers = self._text_xpath(
            cmd, "//cmd:identificationInfo/cmd:identifier/text()")

        languages = self._text_xpath(
            cmd,
            "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()"
        )

        xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

        # convert the descriptions to a JSON string of type {"fin": "kuvaus", "eng": "desc"}
        desc_json = {}
        for desc in xml.xpath("//cmd:identificationInfo/cmd:description",
                              namespaces=self.namespaces):
            lang = convert_language(desc.get(xml_lang, 'undefined').strip())
            # An element without text has text None: store "" instead of the
            # string "None".
            desc_json[lang] = unicode(desc.text or '').strip()

        description = json.dumps(desc_json)

        # convert the titles to a JSON string of type {"fin": "otsikko", "eng": "title"}
        transl_json = {}
        for name_element in xml.xpath(
                '//cmd:identificationInfo/cmd:resourceName',
                namespaces=self.namespaces):
            lang = convert_language(
                name_element.get(xml_lang, 'undefined').strip())
            # Guard against an element without text, where .text is None.
            transl_json[lang] = (name_element.text or '').strip()

        title = json.dumps(transl_json)
        provider = self.provider
        version = first(
            self._text_xpath(
                resource_info,
                "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
        coverage = first(
            self._text_xpath(
                resource_info,
                "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()"
            )) or ""

        pids = []
        primary_pid = ''
        direct_download_URL = ''
        access_request_URL = ''
        access_application_URL = ''

        # Only the first identifier containing 'urn' is recorded, as the primary
        # pid. Other metadata identifiers and the data identifiers
        # (//cmd:identificationInfo/cmd:url) are currently not stored as
        # relation pids.
        for metadata_pid in metadata_identifiers:
            pid = CmdiReader._language_bank_urn_pid_enhancement(metadata_pid)
            if 'urn' in pid and not primary_pid:
                pids.append(dict(id=pid, provider=provider, type='primary'))
                primary_pid = pid

        raw_license = first(
            self._text_xpath(
                resource_info,
                "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()"
            )) or 'notspecified'
        license_identifier = CmdiReader._language_bank_license_enhancement(
            raw_license)
        availability = CmdiReader._language_bank_availability_from_license(
            license_identifier)

        # The access URLs are derived from the primary pid, unless the licence
        # is still under negotiation.
        if license_identifier.lower().strip() != 'undernegotiation':
            if availability == 'direct_download':
                direct_download_URL = primary_pid
            elif availability == 'access_request':
                access_request_URL = primary_pid
            elif availability == 'access_application_other':
                # The part after the last '/' of the primary pid identifies
                # the resource.
                sliced_pid = primary_pid.rsplit('/', 1)
                if len(sliced_pid) >= 2:
                    access_application_URL = (
                        'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource='
                        + sliced_pid[1])

        temporal_coverage_begin = ""
        temporal_coverage_end = ""

        if coverage:
            # Only a "<begin>-<end>" value with exactly one dash is used.
            split = [item.strip() for item in coverage.split("-")]
            if len(split) == 2:
                temporal_coverage_begin, temporal_coverage_end = split

        # TODO: Check agent mapping. Not mapped yet:
        #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson
        #   //cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization
        #   //cmd:metadataInfo/cmd:metadataCreator
        contacts = self._persons_as_contact(
            self._get_persons(resource_info, "//cmd:contactPerson"))

        agents = []
        agents.extend(
            self._persons_as_agent(
                self._get_persons(
                    resource_info,
                    "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
        agents.extend(
            self._persons_as_agent(
                self._get_persons(
                    resource_info,
                    "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"
                ), 'owner'))

        agents.extend(
            self._organization_as_agent(
                self._get_organizations(
                    resource_info,
                    "//cmd:distributionInfo/cmd:iprHolderOrganization"),
                'author'))
        agents.extend(
            self._organization_as_agent(
                self._get_organizations(
                    resource_info,
                    "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"
                ), 'owner'))

        # Reuse the id of a package already known by this primary pid,
        # otherwise allocate a new one.
        existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
        if existing_package_id:
            package_id = existing_package_id
        else:
            package_id = get_unique_package_id()

        result = {
            'name': pid_to_name(package_id),
            'language': ",".join(languages),
            'pids': pids,
            'version': version,
            'notes': description,
            'title': title,
            'type': 'dataset',
            'contact': contacts,
            'agent': agents,
            'availability': availability,
            'direct_download_URL': direct_download_URL,
            'access_request_URL': access_request_URL,
            'access_application_URL': access_application_URL,
            'temporal_coverage_begin': temporal_coverage_begin,
            'temporal_coverage_end': temporal_coverage_end,
            'license_id': license_identifier,
            'license_URL': ''
        }

        if not languages:
            result['langdis'] = u'True'

        if package_id:
            result['id'] = package_id

        # TODO: Ask about distributionAccessMedium (the availability element
        # under cmd:distributionInfo is not read yet).

        return result
コード例 #38
0
 def _get_mime_type(self):
     """ Return the string of the first 'format' entry of self.dc whose text
     contains a slash, or an empty string when nothing usable is found. """
     slash_pattern = re.compile('/')
     format_entries = self.dc('format', text=slash_pattern, recursive=False)
     format_strings = [entry.string for entry in format_entries]
     return first(format_strings) or ''
コード例 #39
0
    def _read(self):
        project_funder, project_funding, project_name, project_homepage = _get_project_stuff(
            self.dc) or ('', '', '', '')

        # Todo! This needs to be improved to use also simple-dc
        # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
        availability, license_id, license_url, access_application_url = _get_rights(
            self.dc) or ('', '', '', '')
        if not availability:
            availability = first(self._get_availability())

        uploader = self._get_uploader()

        data_pids = list(_get_data_pids(self.dc))

        tags = []
        #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
        #    tags.extend(self._resolve_tags(tag))
        tags = [a.string for a in self.dc('subject', recursive=False)]

        transl_json = {}
        for title in self.dc('title', recursive=False):
            lang = utils.convert_language(title.get('xml:lang', '').strip())
            transl_json[lang] = title.string.strip()

        title = json.dumps(transl_json)

        def _get_primary_pid(data_pids):
            for dpid in data_pids:
                if dpid.startswith('urn:nbn:fi:csc-ida'):
                    data_pids.remove(dpid)
                    return [dpid]
            return []

        # Create a unified internal harvester format dict
        unified = dict(
            # ?=dc('source', recursive=False),
            # ?=dc('relation', recursive=False),
            # ?=dc('type', recursive=False),
            access_application_URL=access_application_url or '',

            # Todo! Implement
            access_request_URL='',
            algorithm=first(_get_algorithm(self.dc)) or '',

            # TODO: Handle availabilities better
            availability=availability,
            checksum=_get_checksum(self.dc) or '',
            direct_download_URL=first(_get_download(self.dc)) or '',

            # Todo! Implement
            discipline='',

            # Todo! Should be possible to implement with QDC, but not with OAI_DC
            # evdescr=[],
            # evtype=[],
            # evwhen=[],
            # evwho=[],

            # Todo! Implement
            geographic_coverage='',

            #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
            title=title,
            language=','.join(
                sorted(
                    [a.string for a in self.dc('language', recursive=False)])),
            license_URL=license_url or '',
            license_id=license_id or 'notspecified',

            # Todo! Using only the first entry, for now
            contact=[
                dict(name=name or "",
                     email=email or "",
                     URL=url or "",
                     phone=phone or "")
                for name, email, phone, url in self._get_maintainer_stuff()
            ],

            # Todo! IDA currently doesn't produce this, maybe in future
            # dc('hasFormat', recursive=False)
            mimetype=self._get_mime_type(),
            notes=self._read_notes(),

            # Todo! Using only the first entry, for now
            # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',
            pids=[
                dict(id=pid, provider=_get_provider(self.bs), type=u'primary')
                for pid in _get_primary_pid(data_pids)
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation') for pid in data_pids
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation')
                for pid in self._get_version_pids()
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation')
                for pid in _get_metadata_pid(self.dc)
            ],
            agent=[
                dict(role='author',
                     name=orgauth.get('value', ''),
                     id='',
                     organisation=orgauth.get('org', ''),
                     URL='',
                     fundingid='') for orgauth in _get_org_auth(self.dc)
            ] + [
                dict(role='contributor',
                     name=contributor.get('value', ''),
                     id='',
                     organisation=contributor.get('org', ''),
                     URL='',
                     fundingid='') for contributor in _get_contributor(self.dc)
            ] + [
                dict(
                    role='funder',
                    name=first(project_name) or '',
                    id=first(project_name) or '',
                    organisation=first(project_funder) or "",
                    URL=first(project_homepage) or '',
                    fundingid=first(project_funding) or '',
                )
            ] + [
                dict(role='owner',
                     name=first([
                         a.get('resource')
                         for a in self.dc('rightsHolder', recursive=False)
                     ]) or first(_get_rightsholder(self.dc)) or '',
                     id='',
                     organisation='',
                     URL='',
                     fundingid='')
            ],
            tag_string=','.join(tags) or '',

            # Todo! Implement if possible
            temporal_coverage_begin='',
            temporal_coverage_end='',
            type='dataset',
            uploader=uploader,

            # Used in smear harvest code to extract variable, station and year values, but is not used when
            # creating the dataset via API.
            smear_url=first(_get_download(self.dc, False)) or '',

            # Todo! This should be more exactly picked
            version=(self.dc.modified or self.dc.date).string if
            (self.dc.modified or self.dc.date) else '',
            # version=dc(
            #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
            #         partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
        )
        if not unified['language']:
            unified['langdis'] = 'True'

        # Create id and name
        unified['id'] = generate_pid()
        unified['name'] = pid_to_name(unified['id'])

        # If primary pid is missing, set package id as primary pid
        if not any(
                pid.get('type', None) == u'primary'
                for pid in unified['pids']):
            unified['pids'].append(
                dict(id=unified['id'], type=u'primary', provider=None))

        # if not unified['project_name']:
        #    unified['projdis'] = 'True'
        return unified