def test_records(self):
        """ Test record fetching via http-request to prevent accidental changes to interface """
        model.User(name="test", sysadmin=True).save()
        organization = get_action('organization_create')({'user': '******'}, {'name': 'test-organization', 'title': "Test organization"})
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        package_2_data = deepcopy(package_1_data)

        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()
        for pid in package_2_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        packages = [get_action('package_create')({'user': '******'}, package_1_data),
                    get_action('package_create')({'user': '******'}, package_2_data)]

        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'ListSets'})

        root = lxml.etree.fromstring(result.body)
        request_set = self._get_single_result(root, "//o:set")

        set_name = request_set.xpath("string(o:setName)", namespaces=self._namespaces)
        set_spec = request_set.xpath("string(o:setSpec)", namespaces=self._namespaces)
        self.assertEquals(organization['name'], set_spec)
        self.assertEquals(organization['title'], set_name)

        result = self.app.get(url, {'verb': 'ListIdentifiers', 'set': set_spec, 'metadataPrefix': 'oai_dc'})

        root = lxml.etree.fromstring(result.body)
        fail = True

        package_identifiers = [package['id'] for package in packages]
        package_org_names = [Group.get(package['owner_org']).name for package in packages]

        for header in root.xpath("//o:header", namespaces=self._namespaces):
            fail = False
            set_spec = header.xpath("string(o:setSpec)", namespaces=self._namespaces)
            identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
            self.assertTrue(set_spec in package_org_names)
            self.assertTrue(identifier in package_identifiers)

            result = self.app.get(url, {'verb': 'GetRecord', 'identifier': identifier, 'metadataPrefix': 'oai_dc'})

            root = lxml.etree.fromstring(result.body)

            fail_record = True
            for record_result in root.xpath("//o:record", namespaces=self._namespaces):
                fail_record = False
                header = self._get_single_result(record_result, 'o:header')
                self._get_single_result(record_result, 'o:metadata')

                self.assertTrue(header.xpath("string(o:identifier)", namespaces=self._namespaces) in package_identifiers)
                self.assertTrue(header.xpath("string(o:setSpec)", namespaces=self._namespaces) in package_org_names)

            self.assertFalse(fail_record, "No records received")

        self.assertFalse(fail, "No headers (packages) received")
    def test_coverage_temporal_rdf(self):
        """ For some reason _get_results(... "...*") finds temporal nodes four times.
        """
        organization = get_action('organization_create')({'user': '******'}, {'name': 'test-organization-coverage-rdf2', 'title': "Test organization rdf 2"})
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        package = get_action('package_create')({'user': '******'}, package_1_data)
        package_name = package['name']
        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'GetRecord', 'identifier': package_name, 'metadataPrefix': 'rdf'})

        root = lxml.etree.fromstring(result.body)
        expected = ['2003-07-10T06:36:27-12:00', '2010-04-15T03:24:47+12:45']

        found = 0
        for temporal in self._get_results(root, "//dct:temporal/dct:PeriodOfTime/*"):
            self.assertTrue(temporal.text in expected)
            found += 1
        self.assertEquals(4, found, "Unexpected coverage results: {f}".format(f=found))

        get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
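The XPath queries in these tests rely on a prefix map, self._namespaces, that is defined in the test base class and not shown in this listing. Assuming the standard OAI-PMH, Dublin Core and RDF vocabularies implied by the prefixes, it presumably looks roughly like the sketch below (the vocabulary URIs are standard; the exact dict in the base class may contain more entries).

# Rough reconstruction (sketch only) of the prefix map the XPath calls above expect;
# the real mapping lives in the test base class and may differ.
_namespaces = {
    'o': 'http://www.openarchives.org/OAI/2.0/',              # OAI-PMH envelope: o:header, o:setSpec, ...
    'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
    'dc': 'http://purl.org/dc/elements/1.1/',                  # dc:coverage
    'dct': 'http://purl.org/dc/terms/',                        # dct:temporal, dct:PeriodOfTime, dct:spatial_ref
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
}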
Example #3
def update_pid(key, data, errors, context):
    '''
    Replace an empty unicode string with random PID.
    '''
    if isinstance(data[key], unicode) and not data[key]:
        data[key] = utils.generate_pid()
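update_pid follows the standard CKAN navl validator signature (key, data, errors, context), so it would normally be chained onto the PID id field of the dataset schema. A minimal sketch of that wiring, assuming ignore_missing from ckan.lib.navl.validators and a nested 'pids' sub-schema in the ckanext-kata style (the real schema is larger and may differ):

# Minimal sketch, not the actual ckanext-kata schema: chain update_pid after
# ignore_missing so an empty unicode PID id gets replaced with a generated PID.
from ckan.lib.navl.validators import ignore_missing

def apply_pid_validators(schema):
    schema['pids'] = {
        'id': [ignore_missing, update_pid],
        'type': [ignore_missing],
        'provider': [ignore_missing],
    }
    return schema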
    def test_private_record(self):
        '''
        Test that private packages are not listed but public packages are

        '''
        package_1_data = deepcopy(TEST_DATADICT)
        model.User(name="privateuser", sysadmin=True).save()
        organization = get_action('organization_create')({'user': '******'}, {'name': 'private-organization', 'title': "Private organization"})
        package_1_data['private'] = True
        package_1_data['owner_org'] = organization['name']
        package_1_data['name'] = 'private-package'
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()
        package1 = get_action('package_create')({'user': '******'}, package_1_data)
        package_2_data = deepcopy(TEST_DATADICT)
        package_2_data['private'] = False
        package_2_data['owner_org'] = organization['name']
        package_2_data['name'] = 'public-package'
        for pid in package_2_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        url = url_for('/oai')

        result = self.app.get(url, {'verb': 'ListIdentifiers', 'set': 'private-organization', 'metadataPrefix': 'oai_dc'})
        root = lxml.etree.fromstring(result.body)
        self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))

        now = datetime.datetime.isoformat(datetime.datetime.today())
        result = self.app.get(url, {'verb': 'ListRecords', 'set': 'private-organization', 'metadataPrefix': 'rdf', 'until': now})
        root = lxml.etree.fromstring(result.body)
        self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))

        package2 = get_action('package_create')({'user': '******'}, package_2_data)
        result = self.app.get(url, {'verb': 'ListIdentifiers', 'set': 'private-organization', 'metadataPrefix': 'oai_dc'})
        root = lxml.etree.fromstring(result.body)
        for header in root.xpath("//o:header", namespaces=self._namespaces):
            identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
            print identifier
            self.assertTrue(identifier == package2['id'])

        result = self.app.get(url, {'verb': 'ListRecords', 'metadataPrefix': 'rdf'})
        root = lxml.etree.fromstring(result.body)
        for header in root.xpath("//o:header", namespaces=self._namespaces):
            identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
            self.assertTrue(identifier == package2['id'])

        get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
Example #5
def _handle_pids(context, data_dict):
    '''
    Do some PID modifications to data_dict
    '''
    if 'pids' not in data_dict:
        data_dict['pids'] = []
    else:
        # Clean up empty PIDs
        non_empty = []

        for pid in data_dict['pids']:
            if pid.get('id'):
                non_empty.append(pid)

        data_dict['pids'] = non_empty

    if data_dict.get('generate_version_pid') == 'on':
        data_dict['pids'] += [{'id': utils.generate_pid(),
                               'type': 'version',
                               'provider': 'Etsin',
                               }]

    # If no primary data PID, generate one if this is a new dataset
    if not utils.get_pids_by_type('data', data_dict, primary=True):
        model = context["model"]
        session = context["session"]

        if data_dict.get('id'):
            query = session.query(model.Package.id).filter_by(name=data_dict['id'])  # id contains name !
            result = query.first()

            if result:
                return  # Existing dataset, don't generate new data PID

        data_dict['pids'].insert(0, {'id': utils.generate_pid(),
                                     'type': 'data',
                                     'primary': 'True',
                                     'provider': 'Etsin',
                                     })
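To make the effect of _handle_pids concrete, here is a rough worked example for a new dataset (no 'id' in data_dict, so the existing-package lookup is skipped); the context still needs 'model' and 'session' because they are read before that check. The concrete PID value is a placeholder.

# Worked example (sketch): empty PIDs are dropped, a version PID is appended
# because generate_version_pid is 'on', and a primary data PID is prepended.
from ckan import model

data_dict = {
    'generate_version_pid': 'on',
    'pids': [{'id': '', 'type': 'data'},                            # empty -> removed
             {'id': 'urn:nbn:fi:example-123', 'type': 'relation'}],  # placeholder PID
}
_handle_pids({'model': model, 'session': model.Session}, data_dict)
# data_dict['pids'] is now roughly:
#   [{'id': <generated>, 'type': 'data', 'primary': 'True', 'provider': 'Etsin'},
#    {'id': 'urn:nbn:fi:example-123', 'type': 'relation'},
#    {'id': <generated>, 'type': 'version', 'provider': 'Etsin'}]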
    def test_coverage_temporal_rdf(self):
        """ For some reason _get_results(... "...*") finds temporal nodes four times.
        """
        organization = get_action('organization_create')(
            {
                'user': '******'
            }, {
                'name': 'test-organization-coverage-rdf2',
                'title': "Test organization rdf 2"
            })
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        package = get_action('package_create')({
            'user': '******'
        }, package_1_data)
        package_name = package['name']
        url = url_for('/oai')
        result = self.app.get(
            url, {
                'verb': 'GetRecord',
                'identifier': package_name,
                'metadataPrefix': 'rdf'
            })

        root = lxml.etree.fromstring(result.body)
        expected = ['2003-07-10T06:36:27-12:00', '2010-04-15T03:24:47+12:45']

        found = 0
        for temporal in self._get_results(root,
                                          "//dct:temporal/dct:PeriodOfTime/*"):
            self.assertTrue(temporal.text in expected)
            found += 1
        self.assertEquals(4, found,
                          "Unexpected coverage results: {f}".format(f=found))

        get_action('organization_delete')({
            'user': '******'
        }, {
            'id': organization['id']
        })
    def test_coverage(self):
        model.User(name="test_coverage", sysadmin=True).save()
        organization = get_action('organization_create')(
            {
                'user': '******'
            }, {
                'name': 'test-organization-coverage',
                'title': "Test organization"
            })
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        package = get_action('package_create')({
            'user': '******'
        }, package_1_data)
        package_name = package['name']
        url = url_for('/oai')
        result = self.app.get(
            url, {
                'verb': 'GetRecord',
                'identifier': package_name,
                'metadataPrefix': 'oai_dc'
            })

        root = lxml.etree.fromstring(result.body)
        expected = [
            'Keilaniemi (populated place)', 'Espoo (city)',
            '2003-07-10T06:36:27-12:00/2010-04-15T03:24:47+12:45'
        ]

        found = 0
        for coverage in self._get_results(root, "//dc:coverage"):
            self.assertTrue(coverage.text in expected)
            found += 1
        self.assertEquals(3, found, "Unexpected coverage results")

        get_action('organization_delete')({
            'user': '******'
        }, {
            'id': organization['id']
        })
    def test_coverage_spatial_rdf(self):
        organization = get_action('organization_create')(
            {
                'user': '******'
            }, {
                'name': 'test-organization-coverage-rdf',
                'title': "Test organization rdf"
            })
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        package = get_action('package_create')({
            'user': '******'
        }, package_1_data)
        package_name = package['name']
        url = url_for('/oai')
        result = self.app.get(
            url, {
                'verb': 'GetRecord',
                'identifier': package_name,
                'metadataPrefix': 'rdf'
            })

        root = lxml.etree.fromstring(result.body)
        expected = ['Keilaniemi (populated place),Espoo (city)']

        found = 0
        for spatial in self._get_results(
                root,
                "//dct:spatial_ref/rdf:Description/dct:Location/rdf:Description/rdfs:label"
        ):
            self.assertTrue(spatial.text in expected)
            found += 1
        self.assertEquals(1, found, "Unexpected coverage results")

        get_action('organization_delete')({
            'user': '******'
        }, {
            'id': organization['id']
        })
    def test_coverage_spatial_rdf(self):
        organization = get_action('organization_create')({'user': '******'}, {'name': 'test-organization-coverage-rdf', 'title': "Test organization rdf"})
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        package = get_action('package_create')({'user': '******'}, package_1_data)
        package_name = package['name']
        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'GetRecord', 'identifier': package_name, 'metadataPrefix': 'rdf'})

        root = lxml.etree.fromstring(result.body)
        expected = ['Keilaniemi (populated place),Espoo (city)']

        found = 0
        for spatial in self._get_results(root, "//dct:spatial_ref/rdf:Description/dct:Location/rdf:Description/rdfs:label"):
            self.assertTrue(spatial.text in expected)
            found += 1
        self.assertEquals(1, found, "Unexpected coverage results")

        get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
    def test_coverage(self):
        model.User(name="test_coverage", sysadmin=True).save()
        organization = get_action('organization_create')({'user': '******'}, {'name': 'test-organization-coverage', 'title': "Test organization"})
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        package = get_action('package_create')({'user': '******'}, package_1_data)
        package_name = package['name']
        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'GetRecord', 'identifier': package_name, 'metadataPrefix': 'oai_dc'})

        root = lxml.etree.fromstring(result.body)
        expected = ['Keilaniemi (populated place)', 'Espoo (city)', '2003-07-10T06:36:27-12:00/2010-04-15T03:24:47+12:45']

        found = 0
        for coverage in self._get_results(root, "//dc:coverage"):
            self.assertTrue(coverage.text in expected)
            found += 1
        self.assertEquals(3, found, "Unexpected coverage results")

        get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
    def test_records(self):
        """ Test record fetching via http-request to prevent accidental changes to interface """
        model.User(name="test", sysadmin=True).save()
        organization = get_action('organization_create')(
            {
                'user': '******'
            }, {
                'name': 'test-organization',
                'title': "Test organization"
            })
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        package_2_data = deepcopy(package_1_data)

        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()
        for pid in package_2_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        packages = [
            get_action('package_create')({
                'user': '******'
            }, package_1_data),
            get_action('package_create')({
                'user': '******'
            }, package_2_data)
        ]

        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'ListSets'})

        root = lxml.etree.fromstring(result.body)
        request_set = self._get_single_result(root, "//o:set")

        set_name = request_set.xpath("string(o:setName)",
                                     namespaces=self._namespaces)
        set_spec = request_set.xpath("string(o:setSpec)",
                                     namespaces=self._namespaces)
        self.assertEquals(organization['name'], set_spec)
        self.assertEquals(organization['title'], set_name)

        result = self.app.get(url, {
            'verb': 'ListIdentifiers',
            'set': set_spec,
            'metadataPrefix': 'oai_dc'
        })

        root = lxml.etree.fromstring(result.body)
        fail = True

        package_identifiers = [package['id'] for package in packages]
        package_org_names = [
            Group.get(package['owner_org']).name for package in packages
        ]

        for header in root.xpath("//o:header", namespaces=self._namespaces):
            fail = False
            set_spec = header.xpath("string(o:setSpec)",
                                    namespaces=self._namespaces)
            identifier = header.xpath("string(o:identifier)",
                                      namespaces=self._namespaces)
            self.assertTrue(set_spec in package_org_names)
            self.assertTrue(identifier in package_identifiers)

            result = self.app.get(
                url, {
                    'verb': 'GetRecord',
                    'identifier': identifier,
                    'metadataPrefix': 'oai_dc'
                })

            root = lxml.etree.fromstring(result.body)

            fail_record = True
            for record_result in root.xpath("//o:record",
                                            namespaces=self._namespaces):
                fail_record = False
                header = self._get_single_result(record_result, 'o:header')
                self._get_single_result(record_result, 'o:metadata')

                self.assertTrue(
                    header.xpath("string(o:identifier)",
                                 namespaces=self._namespaces) in
                    package_identifiers)
                self.assertTrue(
                    header.xpath("string(o:setSpec)",
                                 namespaces=self._namespaces) in
                    package_org_names)

            self.assertFalse(fail_record, "No records received")

        self.assertFalse(fail, "No headers (packages) received")
Example #12
    def _ddi2ckan(self, original_url, original_xml, harvest_object):
        '''Extract package values from bs4 object 'ddi_xml' parsed from xml
        '''
        # TODO: Use .extract() and .string.extract() function so handled elements are removed from ddi_xml.
        doc_citation = "ddi_xml.codeBook.docDscr.citation"
        stdy_dscr = "ddi_xml.codeBook.stdyDscr"

        ####################################################################
        #      Read mandatory metadata fields:                             #
        ####################################################################
        # Authors & organizations
        authors = self.get_authors(self.ddi_xml.stdyDscr.citation, 'AuthEnty')
        agent = authors[:]
        agent.extend(self.get_contributors(self.ddi_xml.stdyDscr.citation))

        # Availability
        availability = AVAILABILITY_DEFAULT
        if _access_request_URL_is_found():
            availability = 'direct_download'
        if _is_fsd(original_url):
            availability = AVAILABILITY_FSD

        # Keywords
        keywords = self.get_keywords(self.ddi_xml.stdyDscr.stdyInfo.subject)

        # Language
        # TODO: Where/how to extract multiple languages: 'language': u'eng, fin, swe' ?
        language = self.convert_language(
            self._read_value("ddi_xml.codeBook.get('xml:lang')"))

        # Titles
        titles = self._read_value(stdy_dscr + ".citation.titlStmt(['titl', 'parTitl'])") or \
            self._read_value(doc_citation + ".titlStmt(['titl', 'parTitl'])", mandatory_field=True)

        # langtitle=[dict(lang=self.convert_language(a.get('xml:lang', '')), value=a.text) for a in titles]
        # [{"lang": "fin", "value": "otsikko"}, {"lang": "en", "value": "title"}]

        # convert the titles to a JSON string of the form {"fin": "otsikko", "eng": "title"}
        transl_json = {}
        first_title = ""

        # Default to Finnish, since the first title has no lang value, which causes the validator to whine.
        # We might want to update the DDI harvester to accept a language configuration parameter, if
        # we decide to harvest DDI resources from other sources.
        default_lang = "fi"
        for title in titles:
            transl_json[self.convert_language(title.get('xml:lang', default_lang))] = title.text

            # save the first title for use later on
            if not first_title:
                first_title = title.text

        title = json.dumps(transl_json)

        # License
        # TODO: Extract prettier output. Should we check that element contains something?
        # Should this be in optional section if not mandatory_field?
        license_url = self._read_value(stdy_dscr + ".dataAccs.useStmt.get_text(separator=u' ')", mandatory_field=False)
        if _is_fsd(original_url):
            license_id = LICENSE_ID_FSD
        else:
            license_id = LICENSE_ID_DEFAULT

        # Contact (package_extra.key: contact_[k]_name in database, contact in WUI)
        contact_name = self._read_value(stdy_dscr + ".citation.distStmt('contact')") or \
                     self._read_value(stdy_dscr + ".citation.distStmt('distrbtr')") or \
                     self._read_value(doc_citation + ".prodStmt('producer')", mandatory_field=True)
        # TODO: Clean out (or ask FSD to clean) mid-text newlines (e.g. in FSD2482)
        if contact_name and contact_name[0].text:
            contact_name = contact_name[0].text
        else:
            contact_name = self._read_value(stdy_dscr + ".citation.prodStmt.producer.get('affiliation')", mandatory_field=True)
        if _is_fsd(original_url):
            contact_email = CONTACT_EMAIL_FSD
            # TODO: Allow trying other email also in FSD metadata
        else:
            contact_email = self._read_value(stdy_dscr + ".citation.distStmt.contact.get('email')", mandatory_field=True)

        # Modified date
        version = self.get_attr_optional(self.ddi_xml.stdyDscr.citation,
                                         'prodDate', 'date') or \
                  self.get_attr_mandatory(self.ddi_xml.stdyDscr.citation,
                                          'version', 'date')

        # Name
        name_prefix = self._read_value(stdy_dscr + ".citation.titlStmt.IDNo.get('agency')", mandatory_field=False)
        name_id = self._read_value(stdy_dscr + ".citation.titlStmt.IDNo.text", mandatory_field=False)
        if not name_prefix:
            name_prefix = self._read_value(doc_citation + ".titlStmt.IDNo['agency']", mandatory_field=True)
        if not name_id:
            name_id = self._read_value(doc_citation + ".titlStmt.IDNo.text", mandatory_field=True)
        name = utils.datapid_to_name(name_prefix + name_id)

        pids = list()
        pids.append({'id': name, 'type': 'data', 'primary': 'True', 'provider': name_prefix})

        # Should we generate a version PID?
        # vpid = utils.generate_pid()
        # pids.append({'id': vpid, 'type': 'version', 'provider': 'kata'})

        # Original web page as resource
        # For FSD 'URI' leads to summary web page of data, hence format='html'
        orig_web_page = self._read_value(doc_citation + ".holdings.get('URI', '')")
        if orig_web_page:
            orig_web_page_resource = {'description': first_title,
                                      'format': u'html',
                                      'resource_type': 'documentation',
                                      'url': orig_web_page}
        else:
            orig_web_page_resource = {}

        # Owner
        owner = self._read_value(stdy_dscr + ".citation.prodStmt.producer.text") or \
                self._read_value(stdy_dscr + ".citation.rspStmt.AuthEnty.text") or \
                self._read_value(doc_citation + ".prodStmt.producer.string", mandatory_field=True)
        agent.append({'role': 'owner',
                      'name': owner})

        # Owner organisation
        if harvest_object:
            hsid = harvest_object.harvest_source_id
            hsooid = model.Session.query(model.Package).filter(model.Package.id==hsid).one().owner_org
            owner_org = model.Session.query(model.Group).filter(model.Group.id==hsooid).one().name
        else:
            owner_org = u''

        # Distributor (Agent: distributor, the same is used as contact)
        agent.append({
            'role': 'distributor',
            'name': contact_name})

        ####################################################################
        #      Read optional metadata fields:                              #
        ####################################################################
        # Availability
        if _is_fsd(original_url):
            access_request_url = ACCESS_REQUEST_URL_FSD
        else:
            access_request_url = u''

        # Contact
        contact_phone = self._read_value(doc_citation + ".holdings.get('callno')") or \
                        self._read_value(stdy_dscr + ".citation.holdings.get('callno')")

        # Parenthesise the FSD fallback so the conditional expression does not swallow the whole or-chain.
        contact_URL = self._read_value(stdy_dscr + ".dataAccs.setAvail.accsPlac.get('URI')") or \
                      self._read_value(stdy_dscr + ".citation.distStmt.contact.get('URI')") or \
                      self._read_value(stdy_dscr + ".citation.distStmt.distrbtr.get('URI')") or \
                      (CONTACT_URL_FSD if _is_fsd(original_url) else None)

        # convert the descriptions to a JSON string of the form {"fin": "aineiston kuvaus", "eng": "dataset description"}
        descriptions = self._read_value(stdy_dscr + ".stdyInfo.abstract('p')")
        if not descriptions:
            descriptions = self._read_value(stdy_dscr + ".citation.serStmt.serInfo('p')")
        translated_notes = {}

        for des in descriptions:
            lang = self.convert_language(des.get('xml:lang', 'fi'))
            if lang in translated_notes:
                translated_notes[lang] += '\r\n\r\n' + des.text
            else:
                translated_notes[lang] = des.text

        notes = json.dumps(translated_notes)

        # Discipline
        discipline = self.get_discipline(self.ddi_xml.stdyDscr.stdyInfo.subject)

        # Dataset lifetime events
        events = self._get_events(stdy_dscr, authors)

        # Geographic coverage
        geo_cover = self.get_geo_coverage(self.ddi_xml)

        # Temporal coverage
        temp_start, temp_end = self.get_temporal_coverage(self.ddi_xml)

        # Citation
        citation = self._read_value(stdy_dscr + ".citation.biblCit.text", mandatory_field=False)


        ####################################################################
        #      Flatten rest to 'XPath/path/to/element': 'value' pairs      #
        ####################################################################
        etree_xml = etree.fromstring(str(self.ddi_xml))
        flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}docDscr'))
        xpath_dict = flattened_ddi.getMap()
        flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}stdyDscr'))
        xpath_dict.update(flattened_ddi.getMap())


        package_dict = dict(
            access_application_URL=u'',
            access_request_URL=unicode(access_request_url),
            agent=agent,
            algorithm=u'',   # To be implemented straight in 'resources'
            availability=unicode(availability),
            contact=[{'name': contact_name,
                      'email': contact_email,
                      'URL': contact_URL,
                      'phone': contact_phone}],
            direct_download_URL=u'',  # To be implemented straight in 'resources'
            discipline=discipline,
            event=events,
            geographic_coverage=geo_cover,
            groups=[],
            id=self._get_id_by_name(name) or generate_pid(),
            # langtitle=langtitle,
            langdis=u'True',  # NOTE!
            language=language,
            license_URL=license_url,
            license_id=license_id,
            mimetype=u'',  # To be implemented straight in 'resources'
            name=name,
            notes=notes or u'',
            pids=pids,
            owner_org=owner_org,
            resources=[orig_web_page_resource],
            tag_string=keywords,
            temporal_coverage_begin=temp_start,
            temporal_coverage_end=temp_end,
            # title=langtitle[0].get('value'),   # Must exist in package dict
            title=title,
            type='dataset',
            version=version,
            version_PID='',
            citation=citation
        )
        package_dict['xpaths'] = xpath_dict
        # Above line creates:
        # package_dict = {
        #     'access_request_url': 'some_url',
        #     # ...
        #     'xpaths': {'stdyDscr/othrStdyMat.0/relPubl.34':
        #                'Uskon asia: nuorisobarometri 2006 (2006).',
        #                'stdyD...': 'Some value'}
        # }
        #package_dict['extras'].update(_save_ddi_variables_to_csv(ddi_xml, somepkg))


        # Order of calls in the old code:
        #_save_original_xml_and_link_as_resources()
        #_save_ddi_variables_to_csv()
        #_create_group_based_on_organizations()
        #_last_statements_to_rewrite()

        # JuhoL: Set harvest object to some end state and commit
        if harvest_object is not None:
            harvest_object.content = None
            # Should this be flushed? model.Session.flush()
        #model.repo.commit()

        return package_dict
Example #13
 def test_generate_pid2(self):
     pid = utils.generate_pid()
     pid2 = utils.generate_pid()
     assert pid != pid2
Example #14
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
        - performing any necessary action with the fetched object (e.g. create a CKAN package).
        Note: if this stage creates or updates a package, a reference
        to the package should be added to the HarvestObject.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may occur.
        - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.report_status == "deleted":
            if harvest_object.package_id:
                get_action('package_delete')({
                    'model': model,
                    'session': model.Session,
                    'user': '******'
                }, {
                    'id': harvest_object.package_id
                })
                return True
            return True

        if not harvest_object.content:
            self._save_object_error(
                'Import: Empty content for object {id}'.format(
                    id=harvest_object.id), harvest_object)

            return False

        content = json.loads(harvest_object.content)
        # import pprint; pprint.pprint(content)

        package_dict = content.pop('unified')
        package_dict['xpaths'] = content

        # If package exists use old PID, otherwise create new
        pkg_id = ckanext.kata.utils.get_package_id_by_primary_pid(package_dict)
        pkg = Session.query(Package).filter(
            Package.id == pkg_id).first() if pkg_id else None
        log.debug('Package: "{pkg}"'.format(pkg=pkg))

        if pkg and not self._recreate(harvest_object):
            log.debug("Not re-creating package: %s", pkg_id)
            return True
        if not package_dict.get('id', None):
            package_dict['id'] = pkg.id if pkg else generate_pid()

        uploader = ''

        try:
            package = model.Package.get(harvest_object.harvest_source_id)
            if package and package.owner_org:
                package_dict['owner_org'] = package.owner_org

            config = self._get_configuration(harvest_object)
            if config.get('type') == 'ida':
                if package_dict.get('owner_org', False):
                    package_dict['private'] = "true"
                uploader = package_dict.get('uploader', False)
                package_dict.pop('uploader')
            if config.get('type') == 'ida':
                package_dict['persist_schema'] = u'True'
            schema = self.get_schema(config, pkg)
            # schema['xpaths'] = [ignore_missing, ckanext.kata.converters.xpath_to_extras]
            result = self._create_or_update_package(
                package_dict,
                harvest_object,
                schema=schema,
                # s_schema=ckanext.kata.plugin.KataPlugin.show_package_schema()
            )
            if uploader and asbool(c.get('kata.ldap.enabled', False)):
                try:
                    usr = ld.get_user_from_ldap(uploader)
                    # Initialise usrname so a falsy usr does not leave it unbound
                    # (the NameError would otherwise be swallowed by the bare except below)
                    usrname = None
                    if usr:
                        # by_openid leaves session hanging if usr is not set
                        usrname = model.User.by_openid(usr)
                    if usrname:
                        editor_dict = {
                            "name": package_dict['name'],
                            "role": "admin",
                            "username": usrname.name
                        }
                        context = {
                            'model': model,
                            'session': model.Session,
                            'user': '******'
                        }
                        try:
                            # if we fail the adding, no problem
                            ckanext.kata.actions.dataset_editor_add(
                                context, editor_dict)
                        except ValidationError:
                            pass
                        except NotFound:
                            pass
                        except NotAuthorized:
                            pass
                except:
                    pass
        except Exception as e:
            import traceback
            traceback.print_exc()
            self._save_object_error(
                'Import: Could not create {id}. {e}'.format(
                    id=harvest_object.id, e=e), harvest_object)
            return False

        return result
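The 'ida' branches above read their flag from self._get_configuration(harvest_object), i.e. from the harvest source's configuration blob. A minimal configuration consistent with the keys this method reads would be the sketch below; real sources presumably set more options (for example whatever _recreate() inspects).

# Minimal harvest source configuration consistent with the keys read above;
# presumably stored as JSON in the harvest source's config field and parsed
# by _get_configuration().
harvest_source_config = '{"type": "ida"}'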
Example #15
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
        - performing any necessary action with the fetched object (e.g. create a CKAN package).
        Note: if this stage creates or updates a package, a reference
        to the package should be added to the HarvestObject.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may occur.
        - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.report_status == "deleted":
            if harvest_object.package_id:
                get_action('package_delete')({'model': model, 'session': model.Session, 'user': '******'}, {'id': harvest_object.package_id})
                return True
            return True

        if not harvest_object.content:
            self._save_object_error('Import: Empty content for object {id}'.format(
                id=harvest_object.id), harvest_object)

            return False

        content = json.loads(harvest_object.content)
        # import pprint; pprint.pprint(content)

        package_dict = content.pop('unified')
        package_dict['xpaths'] = content

        # If package exists use old PID, otherwise create new
        pkg_id = ckanext.kata.utils.get_package_id_by_primary_pid(package_dict)
        pkg = Session.query(Package).filter(Package.id == pkg_id).first() if pkg_id else None
        log.debug('Package: "{pkg}"'.format(pkg=pkg))

        if pkg and not self._recreate(harvest_object):
            log.debug("Not re-creating package: %s", pkg_id)
            return True
        if not package_dict.get('id', None):
            package_dict['id'] = pkg.id if pkg else generate_pid()

        uploader = ''

        try:
            package = model.Package.get(harvest_object.harvest_source_id)
            if package and package.owner_org:
                package_dict['owner_org'] = package.owner_org

            config = self._get_configuration(harvest_object)
            if config.get('type') == 'ida':
                if package_dict.get('owner_org', False):
                    package_dict['private'] = "true"
                uploader = package_dict.get('uploader', False)
                package_dict.pop('uploader')
            if config.get('type') == 'ida':
                package_dict['persist_schema'] = u'True'
            schema = self.get_schema(config, pkg)
            # schema['xpaths'] = [ignore_missing, ckanext.kata.converters.xpath_to_extras]
            result = self._create_or_update_package(package_dict,
                                                    harvest_object,
                                                    schema=schema,
                                                    # s_schema=ckanext.kata.plugin.KataPlugin.show_package_schema()
                                                    )
            if uploader and asbool(c.get('kata.ldap.enabled', False)):
                try:
                    usr = ld.get_user_from_ldap(uploader)
                    # Initialise usrname so a falsy usr does not leave it unbound
                    # (the NameError would otherwise be swallowed by the bare except below)
                    usrname = None
                    if usr:
                        # by_openid leaves session hanging if usr is not set
                        usrname = model.User.by_openid(usr)
                    if usrname:
                        editor_dict = {"name": package_dict['name'],
                                       "role": "admin",
                                       "username": usrname.name
                                       }
                        context = {'model': model, 'session': model.Session,
                                   'user': '******'}
                        try:
                            # if we fail the adding, no problem
                            ckanext.kata.actions.dataset_editor_add(context, editor_dict)
                        except ValidationError:
                            pass
                        except NotFound:
                            pass
                        except NotAuthorized:
                            pass
                except:
                    pass
        except Exception as e:
            import traceback
            traceback.print_exc()
            self._save_object_error('Import: Could not create {id}. {e}'.format(
                id=harvest_object.id, e=e), harvest_object)
            return False

        return result
    def test_private_record(self):
        '''
        Test that private packages are not listed but public packages are

        '''
        package_1_data = deepcopy(TEST_DATADICT)
        model.User(name="privateuser", sysadmin=True).save()
        organization = get_action('organization_create')(
            {
                'user': '******'
            }, {
                'name': 'private-organization',
                'title': "Private organization"
            })
        package_1_data['private'] = True
        package_1_data['owner_org'] = organization['name']
        package_1_data['name'] = 'private-package'
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()
        package1 = get_action('package_create')({
            'user': '******'
        }, package_1_data)
        package_2_data = deepcopy(TEST_DATADICT)
        package_2_data['private'] = False
        package_2_data['owner_org'] = organization['name']
        package_2_data['name'] = 'public-package'
        for pid in package_2_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        url = url_for('/oai')

        result = self.app.get(
            url, {
                'verb': 'ListIdentifiers',
                'set': 'private-organization',
                'metadataPrefix': 'oai_dc'
            })
        root = lxml.etree.fromstring(result.body)
        self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))

        now = datetime.datetime.isoformat(datetime.datetime.today())
        result = self.app.get(
            url, {
                'verb': 'ListRecords',
                'set': 'private-organization',
                'metadataPrefix': 'rdf',
                'until': now
            })
        root = lxml.etree.fromstring(result.body)
        self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))

        package2 = get_action('package_create')({
            'user': '******'
        }, package_2_data)
        result = self.app.get(
            url, {
                'verb': 'ListIdentifiers',
                'set': 'private-organization',
                'metadataPrefix': 'oai_dc'
            })
        root = lxml.etree.fromstring(result.body)
        for header in root.xpath("//o:header", namespaces=self._namespaces):
            identifier = header.xpath("string(o:identifier)",
                                      namespaces=self._namespaces)
            print identifier
            self.assertTrue(identifier == package2['id'])

        result = self.app.get(url, {
            'verb': 'ListRecords',
            'metadataPrefix': 'rdf'
        })
        root = lxml.etree.fromstring(result.body)
        for header in root.xpath("//o:header", namespaces=self._namespaces):
            identifier = header.xpath("string(o:identifier)",
                                      namespaces=self._namespaces)
            self.assertTrue(identifier == package2['id'])

        get_action('organization_delete')({
            'user': '******'
        }, {
            'id': organization['id']
        })
Example #17
 def test_generate_pid(self):
     pid = utils.generate_pid()
     assert pid.startswith('urn')
     assert len(pid) >= 10
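Together with test_generate_pid2 earlier in this listing, the assertions above pin down the visible contract of utils.generate_pid(): a URN-prefixed string, at least ten characters long, different on every call. A stand-in that satisfies exactly that contract, purely as a sketch and not the actual ckanext.kata implementation, would be:

# Sketch only: honours the tested contract (starts with 'urn', length >= 10,
# unique per call); the real implementation is ckanext.kata.utils.generate_pid.
import uuid

def generate_pid():
    return 'urn:nbn:fi:example-%s' % uuid.uuid4()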
    def _read(self):
        project_funder, project_funding, project_name, project_homepage = _get_project_stuff(self.dc) or ('', '', '', '')

        # Todo! This needs to be improved to use also simple-dc
        # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
        availability, license_id, license_url, access_application_url = _get_rights(self.dc) or ('', '', '', '')
        if not availability:
            availability = first(self._get_availability())

        uploader = self._get_uploader()

        data_pids = list(_get_data_pids(self.dc))

        tags = []
        #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
        #    tags.extend(self._resolve_tags(tag))
        tags = [a.string for a in self.dc('subject', recursive=False)]

        transl_json = {}
        for title in self.dc('title', recursive=False):
            lang = utils.convert_language(title.get('xml:lang', '').strip())
            transl_json[lang] = title.string.strip()

        title = json.dumps(transl_json)

        def _get_primary_pid(data_pids):
            for dpid in data_pids:
                if dpid.startswith('urn:nbn:fi:csc-ida'):
                    data_pids.remove(dpid)
                    return [dpid]
            return []

        # Create a unified internal harvester format dict
        unified = dict(
            # ?=dc('source', recursive=False),
            # ?=dc('relation', recursive=False),
            # ?=dc('type', recursive=False),

            access_application_URL=access_application_url or '',

            # Todo! Implement
            access_request_URL='',

            algorithm=first(_get_algorithm(self.dc)) or '',

            # TODO: Handle availabilities better
            availability=availability,

            checksum=_get_checksum(self.dc) or '',

            direct_download_URL=first(_get_download(self.dc)) or '',

            # Todo! Implement
            discipline='',

            # Todo! Should be possible to implement with QDC, but not with OAI_DC
            # evdescr=[],
            # evtype=[],
            # evwhen=[],
            # evwho=[],

            # Todo! Implement
            geographic_coverage='',

            #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],

            title=title,

            language=','.join(sorted([a.string for a in self.dc('language', recursive=False)])),

            license_URL=license_url or '',
            license_id=license_id or 'notspecified',

            # Todo! Using only the first entry, for now
            contact=[dict(name=name or "", email=email or "", URL=url or "", phone=phone or "")
                     for name, email, phone, url in self._get_maintainer_stuff()],

            # Todo! IDA currently doesn't produce this, maybe in future
            # dc('hasFormat', recursive=False)
            mimetype=self._get_mime_type(),

            notes=self._read_notes(),

            # Todo! Using only the first entry, for now
            # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

            pids=[dict(id=pid, provider=_get_provider(self.bs), type=u'primary') for pid in _get_primary_pid(data_pids)] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in data_pids] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in self._get_version_pids()] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in _get_metadata_pid(self.dc)],

            agent=[dict(role='author', name=orgauth.get('value', ''), id='', organisation=orgauth.get('org', ''), URL='', fundingid='') for orgauth in _get_org_auth(self.dc)] +
                  [dict(role='contributor', name=contributor.get('value', ''), id='', organisation=contributor.get('org', ''), URL='', fundingid='') for contributor in _get_contributor(self.dc)] +
                  [dict(role='funder', name=first(project_name) or '', id=first(project_name) or '', organisation=first(project_funder) or "", URL=first(project_homepage) or '', fundingid=first(project_funding) or '',)] +
                  [dict(role='owner', name=first([a.get('resource') for a in self.dc('rightsHolder', recursive=False)]) or first(_get_rightsholder(self.dc)) or '', id='', organisation='', URL='', fundingid='')],

            tag_string=','.join(tags) or '',

            # Todo! Implement if possible
            temporal_coverage_begin='',
            temporal_coverage_end='',

            type='dataset',
            uploader=uploader,

            # Used in smear harvest code to extract variable, station and year values, but is not used when
            # creating the dataset via API.
            smear_url=first(_get_download(self.dc, False)) or '',

            # Todo! This should be more exactly picked
            version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
            # version=dc(
            #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
            #         partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,

        )
        if not unified['language']:
            unified['langdis'] = 'True'

        # Create id and name
        unified['id'] = generate_pid()
        unified['name'] = pid_to_name(unified['id'])

        # If primary pid is missing, set package id as primary pid
        if not any(pid.get('type', None) == u'primary' for pid in unified['pids']):
            unified['pids'].append(dict(id=unified['id'], type=u'primary', provider=None))

        # if not unified['project_name']:
        #    unified['projdis'] = 'True'
        return unified
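The name assigned above comes from pid_to_name, which has to squeeze an arbitrary PID into CKAN's restricted dataset-name alphabet. The real helper lives in ckanext.kata.utils and may differ in detail; the following is a purely hypothetical sketch of that kind of mapping.

# Hypothetical sketch of a PID -> CKAN name mapping (lowercase, only a-z, 0-9,
# '-' and '_'); the real pid_to_name in ckanext.kata.utils may behave differently.
import re

def pid_to_name(pid):
    return re.sub(r'[^a-z0-9_-]+', '-', pid.lower()) if pid else pid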
Example #21
 def get_unique_pids(self, ddict):
     for pid in ddict.get('pids', []):
         pid['id'] = utils.generate_pid()
     return ddict
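get_unique_pids is the test-suite helper behind the repeated "for pid in ... generate_pid()" loops seen in the tests above. A typical call site, sketched below, assumes the shared TEST_DATADICT fixture contains at least one PID entry.

    def test_two_packages_example(self):
        # Sketch: copy the shared fixture twice and regenerate PIDs on each copy
        # so the two package_create calls that would follow cannot collide on an identifier.
        data_1 = self.get_unique_pids(deepcopy(TEST_DATADICT))
        data_2 = self.get_unique_pids(deepcopy(TEST_DATADICT))
        assert data_1['pids'][0]['id'] != data_2['pids'][0]['id']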
    def _read(self):
        project_funder, project_funding, project_name, project_homepage = _get_project_stuff(
            self.dc) or ('', '', '', '')

        # Todo! This needs to be improved to use also simple-dc
        # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
        availability, license_id, license_url, access_application_url = _get_rights(
            self.dc) or ('', '', '', '')
        if not availability:
            availability = first(self._get_availability())

        uploader = self._get_uploader()

        data_pids = list(_get_data_pids(self.dc))

        tags = []
        #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
        #    tags.extend(self._resolve_tags(tag))
        tags = [a.string for a in self.dc('subject', recursive=False)]

        transl_json = {}
        for title in self.dc('title', recursive=False):
            lang = utils.convert_language(title.get('xml:lang', '').strip())
            transl_json[lang] = title.string.strip()

        title = json.dumps(transl_json)

        def _get_primary_pid(data_pids):
            for dpid in data_pids:
                if dpid.startswith('urn:nbn:fi:csc-ida'):
                    data_pids.remove(dpid)
                    return [dpid]
            return []

        # Create a unified internal harvester format dict
        unified = dict(
            # ?=dc('source', recursive=False),
            # ?=dc('relation', recursive=False),
            # ?=dc('type', recursive=False),
            access_application_URL=access_application_url or '',

            # Todo! Implement
            access_request_URL='',
            algorithm=first(_get_algorithm(self.dc)) or '',

            # TODO: Handle availabilities better
            availability=availability,
            checksum=_get_checksum(self.dc) or '',
            direct_download_URL=first(_get_download(self.dc)) or '',

            # Todo! Implement
            discipline='',

            # Todo! Should be possible to implement with QDC, but not with OAI_DC
            # evdescr=[],
            # evtype=[],
            # evwhen=[],
            # evwho=[],

            # Todo! Implement
            geographic_coverage='',

            #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
            title=title,
            language=','.join(
                sorted(
                    [a.string for a in self.dc('language', recursive=False)])),
            license_URL=license_url or '',
            license_id=license_id or 'notspecified',

            # Todo! Using only the first entry, for now
            contact=[
                dict(name=name or "",
                     email=email or "",
                     URL=url or "",
                     phone=phone or "")
                for name, email, phone, url in self._get_maintainer_stuff()
            ],

            # Todo! IDA currently doesn't produce this, maybe in future
            # dc('hasFormat', recursive=False)
            mimetype=self._get_mime_type(),
            notes=self._read_notes(),

            # Todo! Using only the first entry, for now
            # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',
            pids=[
                dict(id=pid, provider=_get_provider(self.bs), type=u'primary')
                for pid in _get_primary_pid(data_pids)
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation') for pid in data_pids
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation')
                for pid in self._get_version_pids()
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation')
                for pid in _get_metadata_pid(self.dc)
            ],
            agent=[
                dict(role='author',
                     name=orgauth.get('value', ''),
                     id='',
                     organisation=orgauth.get('org', ''),
                     URL='',
                     fundingid='') for orgauth in _get_org_auth(self.dc)
            ] + [
                dict(role='contributor',
                     name=contributor.get('value', ''),
                     id='',
                     organisation=contributor.get('org', ''),
                     URL='',
                     fundingid='') for contributor in _get_contributor(self.dc)
            ] + [
                dict(
                    role='funder',
                    name=first(project_name) or '',
                    id=first(project_name) or '',
                    organisation=first(project_funder) or "",
                    URL=first(project_homepage) or '',
                    fundingid=first(project_funding) or '',
                )
            ] + [
                dict(role='owner',
                     name=first([
                         a.get('resource')
                         for a in self.dc('rightsHolder', recursive=False)
                     ]) or first(_get_rightsholder(self.dc)) or '',
                     id='',
                     organisation='',
                     URL='',
                     fundingid='')
            ],
            tag_string=','.join(tags) or '',

            # Todo! Implement if possible
            temporal_coverage_begin='',
            temporal_coverage_end='',
            type='dataset',
            uploader=uploader,

            # Used in smear harvest code to extract variable, station and year values, but is not used when
            # creating the dataset via API.
            smear_url=first(_get_download(self.dc, False)) or '',

            # Todo! This should be more exactly picked
            version=(self.dc.modified or self.dc.date).string if
            (self.dc.modified or self.dc.date) else '',
            # version=dc(
            #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
            #         partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
        )
        if not unified['language']:
            unified['langdis'] = 'True'

        # Create id and name
        unified['id'] = generate_pid()
        unified['name'] = pid_to_name(unified['id'])

        # If primary pid is missing, set package id as primary pid
        if not any(
                pid.get('type', None) == u'primary'
                for pid in unified['pids']):
            unified['pids'].append(
                dict(id=unified['id'], type=u'primary', provider=None))

        # if not unified['project_name']:
        #    unified['projdis'] = 'True'
        return unified