Example #1
0
    def populate_harvest_job(self, harvest_job, set_ids, config, client):
        """Gather package identifiers for a harvest job and create one
        HarvestObject per identifier.

        :param harvest_job: current HarvestJob instance
        :param set_ids: set identifiers passed through to get_package_ids
        :param config: harvest source configuration dict; may contain a
                       'limit' key capping the number of objects created
        :param client: client object passed through to get_package_ids
        :return: list of created HarvestObject ids, or None when no
                 identifiers were gathered (a gather error is saved then)
        """
        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
            .filter(HarvestJob.source == harvest_job.source) \
            .filter(HarvestJob.gather_finished != None) \
            .filter(HarvestJob.id != harvest_job.id) \
            .order_by(HarvestJob.gather_finished.desc()) \
            .limit(1).first()

        # Harvest incrementally (only changes since the previous gather)
        # when the source package was not modified after that gather began.
        last_time = None
        if previous_job and previous_job.finished and model.Package.get(harvest_job.source.id).metadata_modified < previous_job.gather_started:
            last_time = previous_job.gather_started.isoformat()

        # Collect package ids
        package_ids = list(self.get_package_ids(set_ids, config, last_time, client))
        log.debug('Identifiers: %s', package_ids)

        # Unless a full re-harvest was requested, drop identifiers whose
        # converted name already exists as a local package.  Identifiers
        # ending in 'm' are also matched against their 's'-suffixed twin.
        if not self._recreate(harvest_job) and package_ids:
            converted_identifiers = {}
            for identifier in package_ids:
                converted_identifiers[pid_to_name(identifier)] = identifier
                if identifier.endswith(u'm'):
                    converted_identifiers[pid_to_name(u"%ss" % identifier[0:-1])] = identifier

            for package in model.Session.query(model.Package).filter(model.Package.name.in_(converted_identifiers.keys())).all():
                converted_name = package.name
                if converted_identifiers[converted_name] not in package_ids:
                    converted_name = "%sm" % converted_name[0:-1]
                package_ids.remove(converted_identifiers[converted_name])

        # Re-queue identifiers that ended in ERROR state in the previous job.
        if previous_job:
            for previous_error in [error.guid for error in Session.query(HarvestObject).
                                   filter(HarvestObject.harvest_job_id == previous_job.id).
                                   filter(HarvestObject.state == 'ERROR').all()]:
                if previous_error not in package_ids:
                    package_ids.append(previous_error)

        try:
            object_ids = []
            if len(package_ids):
                # Honor the optional 'limit' configuration value.
                for package_id in islice(package_ids, config['limit']) if 'limit' in config else package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                log.debug('Object ids: {i}'.format(i=object_ids))
                return object_ids
            else:
                self._save_gather_error('No packages received for URL: {u}'.format(
                    u=harvest_job.source.url), harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
            raise
Example #2
0
    def test_resource_read_redirect(self):
        """
        resource_read should redirect to dataset page.
        """
        model.repo.new_revision()
        model.Session.commit()

        # Rename the fixture package so its name matches its PID-based name.
        package = model.Package.get(u'annakarenina')
        package.name = utils.pid_to_name(package.id)
        model.Package.save(package)

        # Flag the matching resource as a dataset-type resource.
        resource_id = None
        for res in package.resources:
            if 'Full text.' in res.description:
                model.repo.new_revision()
                res.resource_type = settings.RESOURCE_TYPE_DATASET
                model.Session.commit()
                resource_id = res.id

        offset = '/en' + url_for(controller='package', action='resource_read',
                                 id=package.id, resource_id=resource_id)

        environ = {'REMOTE_USER': '******'}
        response = self.app.get(offset, extra_environ=environ)

        # Follow the redirect; it must land on the dataset page.
        response = response.follow()

        assert response.body.count('Full text.') == 0
        assert len(etree.fromstring(response.body, parser=self.html_parser))
Example #3
0
    def test_resource_read_redirect(self):
        """
        resource_read should redirect to dataset page.
        """
        model.repo.new_revision()
        model.Session.commit()

        dataset = model.Package.get(u'annakarenina')
        dataset.name = utils.pid_to_name(dataset.id)
        model.Package.save(dataset)

        # Pick the resource whose description matches and mark it as a
        # dataset-type resource.
        target_resource_id = None
        for candidate in dataset.resources:
            if 'Full text.' not in candidate.description:
                continue
            model.repo.new_revision()
            candidate.resource_type = settings.RESOURCE_TYPE_DATASET
            model.Session.commit()
            target_resource_id = candidate.id

        offset = '/en' + url_for(controller='package',
                                 action='resource_read',
                                 id=dataset.id,
                                 resource_id=target_resource_id)

        response = self.app.get(offset,
                                extra_environ={'REMOTE_USER': '******'})

        # The redirect must lead to the dataset page.
        response = response.follow()

        assert response.body.count('Full text.') == 0
        assert len(etree.fromstring(response.body, parser=self.html_parser))
Example #4
0
def default_name_from_id(key, data, errors, context):
    '''
    In all cases, generate name from package.id

    :param key: key
    :param data: data
    :param errors: validation errors
    :param context: context
    '''
    package_id = data.get(('id',))
    data[key] = utils.pid_to_name(package_id)
Example #5
0
def default_name_from_id(key, data, errors, context):
    '''
    Always derive the dataset name from package.id.

    :param key: target key in the flattened data dict
    :param data: flattened package data dict
    :param errors: validation errors
    :param context: validation context
    '''
    pid = data.get(('id', ))
    data[key] = utils.pid_to_name(pid)
Example #6
0
 def test_reader(self):
     """Parse cmdi_1.xml with CmdiReader and verify the unified fields."""
     record = _get_record("cmdi_1.xml")
     metadata = CmdiReader("http://localhost/test")(record)
     content = metadata.getMap()
     package = content['unified']
     # The dataset name must be derived from the package id.
     self.assertEquals(package.get('name', None),
                       utils.pid_to_name(package.get('id', None)))
     self.assertEquals(utils.get_primary_pid(package),
                       u'http://urn.fi/urn:nbn:fi:lb-20140730180')
     self.assertEquals(package.get('notes', None),
                       '{"eng": "Test description"}')
     self.assertEquals(package.get('version', None), '2012-09-07')
     self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
Example #7
0
    def read_data(self, xml):
        """ Extract package data from given XML.
        :param xml: xml element (lxml)
        :return: dictionary in the unified harvester format
        :raises CmdiReaderException: if the XML lacks the expected CMD
                or resourceInfo elements
        """
        cmd = first(
            xml.xpath('//oai:record/oai:metadata/cmd:CMD',
                      namespaces=self.namespaces))
        if cmd is None:
            raise CmdiReaderException(
                "Unexpected XML format: No CMD -element found")

        # BUGFIX: xpath() returns a list (never None); indexing an empty
        # result raised IndexError before the intended error could be
        # reported.  Guard on the list instead.
        resource_info_list = cmd.xpath("//cmd:Components/cmd:resourceInfo",
                                       namespaces=self.namespaces)
        if not resource_info_list:
            raise CmdiReaderException(
                "Unexpected XML format: No resourceInfo -element found")
        resource_info = resource_info_list[0]

        metadata_identifiers = self._text_xpath(
            cmd, "//cmd:identificationInfo/cmd:identifier/text()")

        languages = self._text_xpath(
            cmd,
            "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()"
        )

        # convert the descriptions to a JSON string of type {"fin":"kuvaus", "eng","desc"}
        desc_json = {}
        for desc in xml.xpath("//cmd:identificationInfo/cmd:description",
                              namespaces=self.namespaces):
            lang = convert_language(
                desc.get('{http://www.w3.org/XML/1998/namespace}lang',
                         'undefined').strip())
            desc_json[lang] = unicode(desc.text).strip()

        description = json.dumps(desc_json)

        # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
        transl_json = {}
        for title in xml.xpath('//cmd:identificationInfo/cmd:resourceName',
                               namespaces=self.namespaces):
            lang = convert_language(
                title.get('{http://www.w3.org/XML/1998/namespace}lang',
                          'undefined').strip())
            transl_json[lang] = title.text.strip()

        title = json.dumps(transl_json)
        provider = self.provider
        version = first(
            self._text_xpath(
                resource_info,
                "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
        coverage = first(
            self._text_xpath(
                resource_info,
                "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()"
            )) or ""

        pids = []
        primary_pid = ''
        direct_download_URL = ''
        access_request_URL = ''
        access_application_URL = ''

        # data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")

        # The first urn-style identifier becomes the primary PID.
        for pid in [
                CmdiReader._language_bank_urn_pid_enhancement(metadata_pid)
                for metadata_pid in metadata_identifiers
        ]:
            if 'urn' in pid and not primary_pid:
                pids.append(dict(id=pid, provider=provider, type='primary'))
                primary_pid = pid
        #     else:
        #         pids.append(dict(id=pid, provider=provider, type='relation', relation='generalRelation'))
        #
        # pids += [dict(id=CmdiReader._language_bank_urn_pid_enhancement(pid), provider=provider, type='relation',
        #               relation='generalRelation') for pid in data_identifiers]

        license_identifier = CmdiReader._language_bank_license_enhancement(
            first(
                self._text_xpath(
                    resource_info,
                    "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()"
                )) or 'notspecified')
        availability = CmdiReader._language_bank_availability_from_license(
            license_identifier)

        # Map availability to the appropriate URL field, except for
        # licenses still under negotiation.
        if license_identifier.lower().strip() != 'undernegotiation':
            if availability == 'direct_download':
                direct_download_URL = primary_pid
            if availability == 'access_request':
                access_request_URL = primary_pid
            if availability == 'access_application_other':
                sliced_pid = primary_pid.rsplit('/', 1)
                if len(sliced_pid) >= 2:
                    access_application_URL = 'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource=' + sliced_pid[1]

        # Coverage of the form "begin - end" is split into the two fields.
        temporal_coverage_begin = ""
        temporal_coverage_end = ""

        if coverage:
            split = [item.strip() for item in coverage.split("-")]
            if len(split) == 2:
                temporal_coverage_begin = split[0]
                temporal_coverage_end = split[1]

        # TODO: Check agent mapping.
        #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson")
        #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson")
        #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson")
        #print "###", _get_persons(resource_info, "//cmd:contactPerson")
        #print "###", _get_persons(resource_info, "//cmd:metadataInfo/cmd:metadataCreator")

        #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization")
        #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization")
        #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization")

        contacts = self._persons_as_contact(
            self._get_persons(resource_info, "//cmd:contactPerson"))

        # IPR holders become authors, distribution rights holders owners;
        # both persons and organizations are collected.
        agents = []
        agents.extend(
            self._persons_as_agent(
                self._get_persons(
                    resource_info,
                    "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
        agents.extend(
            self._persons_as_agent(
                self._get_persons(
                    resource_info,
                    "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"
                ), 'owner'))

        agents.extend(
            self._organization_as_agent(
                self._get_organizations(
                    resource_info,
                    "//cmd:distributionInfo/cmd:iprHolderOrganization"),
                'author'))
        agents.extend(
            self._organization_as_agent(
                self._get_organizations(
                    resource_info,
                    "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"
                ), 'owner'))

        # Reuse an existing package id when one is already registered for
        # the primary PID; otherwise mint a new unique id.
        existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
        package_id = existing_package_id or get_unique_package_id()

        result = {
            'name': pid_to_name(package_id),
            'language': ",".join(languages),
            'pids': pids,
            'version': version,
            'notes': description,
            'title': title,
            'type': 'dataset',
            'contact': contacts,
            'agent': agents,
            'availability': availability,
            'direct_download_URL': direct_download_URL,
            'access_request_URL': access_request_URL,
            'access_application_URL': access_application_URL,
            'temporal_coverage_begin': temporal_coverage_begin,
            'temporal_coverage_end': temporal_coverage_end,
            'license_id': license_identifier,
            'license_URL': ''
        }

        if not languages:
            result['langdis'] = u'True'

        if package_id:
            result['id'] = package_id

        # TODO: Ask about distributionAccessMedium
        # _strip_first(_text_xpath(resource_info, "//cmd:distributionInfo/availability/text()"))
        # url = _strip_first(_text_xpath(resource_info, "//cmd:identificationInfo/cmd:url/text()"))

        return result
Example #8
0
    def test_import(self):
        """Exercise the CMDI harvester import stage end to end.

        Imports cmdi_1.xml and checks the resulting package fields, then
        imports cmdi_2.xml and checks temporal coverage and license, and
        finally replays a 'deleted' harvest object and verifies the
        package is marked deleted.
        """
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        harvest_object = self._run_import("cmdi_1.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        # No import errors expected; join collected messages for diagnostics.
        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package.get('name', None),
                          utils.pid_to_name(package.get('id', None)))
        self.assertEquals(utils.get_primary_pid(package),
                          u'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('notes', None),
                          u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        # This relation PID must NOT have been harvested into the package.
        expected_pid = {
            u'id': u'http://islrn.org/resources/248-895-085-557-0',
            u'provider': provider,
            u'type': u'relation',
            u'relation': u'generalRelation'
        }

        self.assertTrue(expected_pid not in package.get('pids'))

        model.Session.flush()

        harvest_object = self._run_import("cmdi_2.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
Example #9
0
 def test_pid_to_name(self):
     """pid_to_name must yield a non-empty name containing no slashes."""
     converted = utils.pid_to_name('http://example.com/some/thing?good=true')
     assert converted
     assert '/' not in converted
Example #10
0
    def _ddi2ckan(self, original_url, original_xml, harvest_object):
        '''Extract package values from bs4 object 'ddi_xml' parsed from xml

        :param original_url: URL the DDI document was harvested from
        :param original_xml: original XML text (unused directly here)
        :param harvest_object: HarvestObject being imported, or None
        :return: package_dict ready for CKAN package create/update
        '''
        # TODO: Use .extract() and .string.extract() function so handled elements are removed from ddi_xml.
        # String "paths" evaluated by self._read_value against the parsed
        # bs4 tree; shared prefixes for document and study descriptions.
        doc_citation = "ddi_xml.codeBook.docDscr.citation"
        stdy_dscr = "ddi_xml.codeBook.stdyDscr"

        ####################################################################
        #      Read mandatory metadata fields:                             #
        ####################################################################
        # Authors & organizations
        authors = self.get_authors(self.ddi_xml.stdyDscr.citation, 'AuthEnty')
        agent = authors[:]
        agent.extend(self.get_contributors(self.ddi_xml.stdyDscr.citation))

        # Availability
        availability = AVAILABILITY_DEFAULT
        if _access_request_URL_is_found():
            availability = 'direct_download'
        if _is_fsd(original_url):
            availability = AVAILABILITY_FSD

        # Keywords
        keywords = self.get_keywords(self.ddi_xml.stdyDscr.stdyInfo.subject)

        # Language
        # TODO: Where/how to extract multiple languages: 'language': u'eng, fin, swe' ?
        language = self.convert_language(
            self._read_value("ddi_xml.codeBook.get('xml:lang')"))

        # Titles
        titles = self._read_value(stdy_dscr + ".citation.titlStmt(['titl', 'parTitl'])") or \
            self._read_value(doc_citation + ".titlStmt(['titl', 'parTitl'])", mandatory_field=True)

        # langtitle=[dict(lang=self.convert_language(a.get('xml:lang', '')), value=a.text) for a in titles]
        # [{"lang":"fin", "value":"otsikko"}, {"lang:"en", "value":"title"}]

        # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
        transl_json = {}
        first_title = ""

        # default to finnish, since first title has no lang value, which causes the validator to whine
        # we might want to update the DDI harvester to accept a language configuration parameter, if
        # we decide to harvest DDI resources from other sources.
        default_lang = "fi"
        for title in titles:
            transl_json[self.convert_language(title.get('xml:lang', default_lang))] = title.text

            # we want to get save the first title for use lateron
            if not first_title:
                first_title = title.text

        title = json.dumps(transl_json)

        # License
        # TODO: Extract prettier output. Should we check that element contains something?
        # Should this be in optional section if not mandatory_field?
        license_url = self._read_value(stdy_dscr + ".dataAccs.useStmt.get_text(separator=u' ')", mandatory_field=False)
        if _is_fsd(original_url):
            license_id = LICENSE_ID_FSD
        else:
            license_id = LICENSE_ID_DEFAULT

        # Contact (package_extra.key: contact_[k]_name in database, contact in WUI)
        contact_name = self._read_value(stdy_dscr + ".citation.distStmt('contact')") or \
                     self._read_value(stdy_dscr + ".citation.distStmt('distrbtr')") or \
                     self._read_value(doc_citation + ".prodStmt('producer')", mandatory_field=True)
        # TODO: clean out (or ask FSD to clean) mid text newlines (eg. in FSD2482)
        if contact_name and contact_name[0].text:
            contact_name = contact_name[0].text
        else:
            contact_name = self._read_value(stdy_dscr + ".citation.prodStmt.producer.get('affiliation')", mandatory_field=True)
        if _is_fsd(original_url):
            contact_email = CONTACT_EMAIL_FSD
            # TODO: Allow trying other email also in FSD metadata
        else:
            contact_email = self._read_value(stdy_dscr + ".citation.distStmt.contact.get('email')", mandatory_field=True)

        # Modified date
        version = self.get_attr_optional(self.ddi_xml.stdyDscr.citation,
                                         'prodDate', 'date') or \
                  self.get_attr_mandatory(self.ddi_xml.stdyDscr.citation,
                                          'version', 'date')

        # This idNos is an FSD specific solution
        idNos = self._read_value(stdy_dscr + ".citation.titlStmt.find_all('IDNo')", mandatory_field=False)
        if not idNos:
            idNos = self._read_value(doc_citation + ".titlStmt.find_all('IDNo')", mandatory_field=True)

        pids = list()

        # IDNos from the 'Kansalliskirjasto' agency become the primary PID;
        # all others are stored as general relations.
        idNoValues = [bsIdNo.text for bsIdNo in idNos]
        agencies = [bsIdNo.get('agency') for bsIdNo in idNos]
        primary_pid = None
        if len(idNoValues) == len(agencies):
            for idNoVal, agency in zip(idNoValues, agencies):
                if agency == 'Kansalli' \
                             'skirjasto':
                    pids.append({'id': idNoVal, 'type': 'primary', 'provider': agency})
                    primary_pid = idNoVal
                else:
                    pids.append({'id': agency + idNoVal, 'type': 'relation', 'provider': agency, 'relation': 'generalRelation'})

        # Should we generate a version PID?
        # vpid = utils.generate_pid()
        # pids.append({'id': vpid, 'type': 'version', 'provider': 'kata'})

        # Original web page as resource
        # For FSD 'URI' leads to summary web page of data, hence format='html'
        orig_web_page = self._read_value(doc_citation + ".holdings.get('URI', '')")
        if orig_web_page:
            orig_web_page_resource = {'description': first_title,
                                      'format': u'html',
                                      'resource_type': 'documentation',
                                      'url': orig_web_page}
        else:
            orig_web_page_resource = {}

        # Owner
        owner = self._read_value(stdy_dscr + ".citation.prodStmt.producer.text") or \
                self._read_value(stdy_dscr + ".citation.rspStmt.AuthEnty.text") or \
                self._read_value(doc_citation + ".prodStmt.producer.string", mandatory_field=True)
        agent.append({'role': 'owner',
                      'name': owner})

        # Owner organisation: resolved from the harvest source's owning
        # organisation when a harvest object is available.
        if harvest_object:
            hsid = harvest_object.harvest_source_id
            hsooid = model.Session.query(model.Package).filter(model.Package.id==hsid).one().owner_org
            owner_org = model.Session.query(model.Group).filter(model.Group.id==hsooid).one().name
        else:
            owner_org = u''

        # Distributor (Agent: distributor, the same is used as contact)
        agent.append({
            'role': 'distributor',
            'name': contact_name})

        ####################################################################
        #      Read optional metadata fields:                              #
        ####################################################################
        # Availability
        if _is_fsd(original_url):
            access_request_url = ACCESS_REQUEST_URL_FSD
        else:
            access_request_url = u''

        # Contact
        contact_phone = self._read_value(doc_citation + ".holdings.get('callno')") or \
                        self._read_value(stdy_dscr + ".citation.holdings.get('callno')")

        contact_URL = self._read_value( stdy_dscr + ".dataAccs.setAvail.accsPlac.get('URI')") or \
                      self._read_value( stdy_dscr + ".citation.distStmt.contact.get('URI')") or \
                      self._read_value( stdy_dscr + ".citation.distStmt.distrbtr.get('URI')") or \
                      CONTACT_URL_FSD if _is_fsd(original_url) else None

        # convert the descriptions to a JSON string of type {"fin":"aineiston kuvaus", "eng","dataset description"}
        descriptions = self._read_value(stdy_dscr + ".stdyInfo.abstract('p')")
        if not descriptions:
            descriptions = self._read_value(stdy_dscr + ".citation.serStmt.serInfo('p')")
        translated_notes = {}

        # Paragraphs in the same language are joined with blank lines.
        for des in descriptions:
            lang = self.convert_language(des.get('xml:lang', 'fi'))
            if lang in translated_notes:
                translated_notes[lang] += '\r\n\r\n' + des.text
            else:
                translated_notes[lang] = des.text

        notes = json.dumps(translated_notes)

        # Discipline
        discipline = self.get_discipline(self.ddi_xml.stdyDscr.stdyInfo.subject)

        # Dataset lifetime events
        events = self._get_events(stdy_dscr, authors)

        # Geographic coverage
        geo_cover = self.get_geo_coverage(self.ddi_xml)

        # Temporal coverage
        temp_start, temp_end = self.get_temporal_coverage(self.ddi_xml)

        # Citation
        citation = self._read_value(stdy_dscr + ".citation.biblCit.text", mandatory_field=False)


        ####################################################################
        #      Flatten rest to 'XPath/path/to/element': 'value' pairs      #
        ####################################################################
        etree_xml = etree.fromstring(str(self.ddi_xml))
        flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}docDscr'))
        xpath_dict = flattened_ddi.getMap()
        flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}stdyDscr'))
        xpath_dict.update(flattened_ddi.getMap())

        # Reuse an existing package id registered for the primary PID,
        # otherwise mint a new unique id; the name is derived from the id.
        existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
        package_id = existing_package_id if existing_package_id else get_unique_package_id()
        package_name = pid_to_name(package_id)

        package_dict = dict(
            access_application_URL=u'',
            access_request_URL=unicode(access_request_url),
            agent=agent,
            algorithm=u'',   # To be implemented straight in 'resources'
            availability=unicode(availability),
            contact=[{'name': contact_name,
                      'email': contact_email,
                      'URL': contact_URL,
                      'phone': contact_phone}],
            direct_download_URL=u'',  # To be implemented straight in 'resources
            discipline=discipline,
            event=events,
            geographic_coverage=geo_cover,
            groups=[],
            id=package_id,
            langdis=u'True',  # HUOMAA!
            language=language,
            license_URL=license_url,
            license_id=license_id,
            mimetype=u'',  # To be implemented straight in 'resources
            name=package_name,
            notes=notes or u'',
            pids=pids,
            owner_org=owner_org,
            resources=[orig_web_page_resource],
            tag_string=keywords,
            temporal_coverage_begin=temp_start,
            temporal_coverage_end=temp_end,
            title=title,
            type='dataset',
            version=version,
            version_PID='',
            citation=citation
        )
        package_dict['xpaths'] = xpath_dict
        # Above line creates:
        # package_dict = {
        #     'access_request_url': 'some_url',
        #     # ...
        #     'xpaths': {'stdyDscr/othrStdyMat.0/relPubl.34':
        #                'Uskon asia: nuorisobarometri 2006 (2006).'},
        #               {'stdyD...': 'Some value'}]
        # }
        #package_dict['extras'].update(_save_ddi_variables_to_csv(ddi_xml, somepkg))


        # Vanhojen koodien järjestys: (old code ordering, kept for reference)
        #_save_original_xml_and_link_as_resources()
        #_save_ddi_variables_to_csv()
        #_create_group_based_on_organizations()
        #_last_statements_to_rewrite()

        # JuhoL: Set harvest object to some end state and commit
        if harvest_object is not None:
            harvest_object.content = None
            # Should this be flushed? model.Session.flush()
        #model.repo.commit()

        return package_dict
Example #11
0
    def _read(self):
        project_funder, project_funding, project_name, project_homepage = _get_project_stuff(self.dc) or ('', '', '', '')

        # Todo! This needs to be improved to use also simple-dc
        # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
        availability, license_id, license_url, access_application_url = _get_rights(self.dc) or ('', '', '', '')
        if not availability:
            availability = first(self._get_availability())

        uploader = self._get_uploader()

        data_pids = list(_get_data_pids(self.dc))

        tags = []
        #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
        #    tags.extend(self._resolve_tags(tag))
        tags = [a.string for a in self.dc('subject', recursive=False)]

        transl_json = {}
        for title in self.dc('title', recursive=False):
            lang = utils.convert_language(title.get('xml:lang', '').strip())
            transl_json[lang] = title.string.strip()

        title = json.dumps(transl_json)

        def _get_primary_pid(data_pids):
            for dpid in data_pids:
                if dpid.startswith('urn:nbn:fi:csc-ida'):
                    data_pids.remove(dpid)
                    return [dpid]
            return []

        # Create a unified internal harvester format dict
        unified = dict(
            # ?=dc('source', recursive=False),
            # ?=dc('relation', recursive=False),
            # ?=dc('type', recursive=False),

            access_application_URL=access_application_url or '',

            # Todo! Implement
            access_request_URL='',

            algorithm=first(_get_algorithm(self.dc)) or '',

            # TODO: Handle availabilities better
            availability=availability,

            checksum=_get_checksum(self.dc) or '',

            direct_download_URL=first(_get_download(self.dc)) or '',

            # Todo! Implement
            discipline='',

            # Todo! Should be possible to implement with QDC, but not with OAI_DC
            # evdescr=[],
            # evtype=[],
            # evwhen=[],
            # evwho=[],

            # Todo! Implement
            geographic_coverage='',

            #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],

            title=title,

            language=','.join(sorted([a.string for a in self.dc('language', recursive=False)])),

            license_URL=license_url or '',
            license_id=license_id or 'notspecified',

            # Todo! Using only the first entry, for now
            contact=[dict(name=name or "", email=email or "", URL=url or "", phone=phone or "")
                     for name, email, phone, url in self._get_maintainer_stuff()],

            # Todo! IDA currently doesn't produce this, maybe in future
            # dc('hasFormat', recursive=False)
            mimetype=self._get_mime_type(),

            notes=self._read_notes(),

            # Todo! Using only the first entry, for now
            # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

            pids=[dict(id=pid, provider=_get_provider(self.bs), type=u'primary') for pid in _get_primary_pid(data_pids)] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in data_pids] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in self._get_version_pids()] +
                 [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in _get_metadata_pid(self.dc)],

            agent=[dict(role='author', name=orgauth.get('value', ''), id='', organisation=orgauth.get('org', ''), URL='', fundingid='') for orgauth in _get_org_auth(self.dc)] +
                  [dict(role='contributor', name=contributor.get('value', ''), id='', organisation=contributor.get('org', ''), URL='', fundingid='') for contributor in _get_contributor(self.dc)] +
                  [dict(role='funder', name=first(project_name) or '', id=first(project_name) or '', organisation=first(project_funder) or "", URL=first(project_homepage) or '', fundingid=first(project_funding) or '',)] +
                  [dict(role='owner', name=first([a.get('resource') for a in self.dc('rightsHolder', recursive=False)]) or first(_get_rightsholder(self.dc)) or '', id='', organisation='', URL='', fundingid='')],

            tag_string=','.join(tags) or '',

            # Todo! Implement if possible
            temporal_coverage_begin='',
            temporal_coverage_end='',

            type='dataset',
            uploader=uploader,

            # Used in smear harvest code to extract variable, station and year values, but is not used when
            # creating the dataset via API.
            smear_url=first(_get_download(self.dc, False)) or '',

            # Todo! This should be more exactly picked
            version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
            # version=dc(
            #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
            #         partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,

        )
        if not unified['language']:
            unified['langdis'] = 'True'

        # Create id and name
        unified['id'] = generate_pid()
        unified['name'] = pid_to_name(unified['id'])

        # If primary pid is missing, set package id as primary pid
        if not any(pid.get('type', None) == u'primary' for pid in unified['pids']):
            unified['pids'].append(dict(id=unified['id'], type=u'primary', provider=None))

        # if not unified['project_name']:
        #    unified['projdis'] = 'True'
        return unified
Example #12
0
    def populate_harvest_job(self, harvest_job, set_ids, config, client):
        """Gather package identifiers for a harvest job and create a
        HarvestObject for each of them.

        :param harvest_job: HarvestJob instance being gathered.
        :param set_ids: OAI-PMH set identifiers to restrict harvesting to.
        :param config: harvest source configuration dict; may contain a
                       'limit' key capping the number of objects created.
        :param client: OAI-PMH client used to list identifiers.
        :return: list of created HarvestObject ids, or None when no
                 packages were received.
        """
        # Check if this source has been harvested before.
        # NOTE: '!= None' (not 'is not None') is required here — it is a
        # SQLAlchemy column expression, not a Python comparison.
        previous_job = Session.query(HarvestJob) \
            .filter(HarvestJob.source == harvest_job.source) \
            .filter(HarvestJob.gather_finished != None) \
            .filter(HarvestJob.id != harvest_job.id) \
            .order_by(HarvestJob.gather_finished.desc()) \
            .limit(1).first()

        # Harvest incrementally (only records changed since the previous
        # gather) when the source itself is unchanged since that gather.
        last_time = None
        if previous_job and previous_job.finished and model.Package.get(
                harvest_job.source.id
        ).metadata_modified < previous_job.gather_started:
            last_time = previous_job.gather_started.isoformat()

        # Collect package ids
        package_ids = list(
            self.get_package_ids(set_ids, config, last_time, client))
        log.debug('Identifiers: %s', package_ids)

        if not self._recreate(harvest_job) and package_ids:
            # Map CKAN-safe converted names back to the original
            # identifiers so that packages already present in the database
            # can be dropped from the gather list.
            converted_identifiers = {}
            for identifier in package_ids:
                converted_identifiers[pid_to_name(identifier)] = identifier
                if identifier.endswith(u'm'):
                    converted_identifiers[pid_to_name(
                        u"%ss" % identifier[0:-1])] = identifier

            for package in model.Session.query(model.Package).filter(
                    model.Package.name.in_(
                        converted_identifiers.keys())).all():
                converted_name = package.name
                if converted_identifiers[converted_name] not in package_ids:
                    converted_name = "%sm" % converted_name[0:-1]
                package_ids.remove(converted_identifiers[converted_name])

        if previous_job:
            # Re-queue identifiers that errored in the previous job.
            for previous_error in [
                    error.guid
                    for error in Session.query(HarvestObject).filter(
                        HarvestObject.harvest_job_id == previous_job.id).
                    filter(HarvestObject.state == 'ERROR').all()
            ]:
                if previous_error not in package_ids:
                    package_ids.append(previous_error)

        try:
            if not package_ids:
                self._save_gather_error(
                    'No packages received for URL: {u}'.format(
                        u=harvest_job.source.url), harvest_job)
                return None

            # Hoisted out of the loop header for readability: an optional
            # 'limit' in the config caps how many objects are created.
            if 'limit' in config:
                id_iterable = islice(package_ids, config['limit'])
            else:
                id_iterable = package_ids

            object_ids = []
            for package_id in id_iterable:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            log.debug('Object ids: {i}'.format(i=object_ids))
            return object_ids
        except Exception as e:
            self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
            raise
Example #13
0
    def read_data(self, xml):
        """ Extract package data from given XML.
        :param xml: xml element (lxml)
        :return: dictionary
        """
        cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD', namespaces=self.namespaces))
        if cmd is None:
            raise CmdiReaderException("Unexpected XML format: No CMD -element found")

        # Bug fix: previously this indexed [0] before the None check, so a
        # missing resourceInfo element raised IndexError and the intended
        # CmdiReaderException branch was dead code. Check the xpath result
        # list for emptiness first.
        resource_info_elements = cmd.xpath("//cmd:Components/cmd:resourceInfo", namespaces=self.namespaces)
        if not resource_info_elements:
            raise CmdiReaderException("Unexpected XML format: No resourceInfo -element found")
        resource_info = resource_info_elements[0]

        metadata_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:identifier/text()")

        languages = self._text_xpath(cmd, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()")

        # convert the descriptions to a JSON string of type {"fin":"kuvaus", "eng","desc"}
        desc_json = {}
        for desc in xml.xpath("//cmd:identificationInfo/cmd:description", namespaces=self.namespaces):
            lang = convert_language(desc.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
            desc_json[lang] = unicode(desc.text).strip()

        description = json.dumps(desc_json)

        # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
        transl_json = {}
        for title in xml.xpath('//cmd:identificationInfo/cmd:resourceName', namespaces=self.namespaces):
            lang = convert_language(title.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
            transl_json[lang] = title.text.strip()

        title = json.dumps(transl_json)
        provider = self.provider
        version = first(self._text_xpath(resource_info, "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
        coverage = first(self._text_xpath(resource_info, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()")) or ""

        pids = []
        primary_pid = ''
        direct_download_URL = ''
        access_request_URL = ''
        access_application_URL = ''

        # data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")

        # The first urn-style identifier found becomes the primary pid.
        for pid in [CmdiReader._language_bank_urn_pid_enhancement(metadata_pid) for metadata_pid in metadata_identifiers]:
            if 'urn' in pid and not primary_pid:
                pids.append(dict(id=pid, provider=provider, type='primary'))
                primary_pid = pid
        #     else:
        #         pids.append(dict(id=pid, provider=provider, type='relation', relation='generalRelation'))
        #
        # pids += [dict(id=CmdiReader._language_bank_urn_pid_enhancement(pid), provider=provider, type='relation',
        #               relation='generalRelation') for pid in data_identifiers]

        license_identifier = CmdiReader._language_bank_license_enhancement(first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()")) or 'notspecified')
        availability = CmdiReader._language_bank_availability_from_license(license_identifier)

        # Map the availability category to the matching URL field, but only
        # when the license is not still under negotiation.
        if license_identifier.lower().strip() != 'undernegotiation':
            if availability == 'direct_download':
                direct_download_URL = primary_pid
            if availability == 'access_request':
                access_request_URL = primary_pid
            if availability == 'access_application_other':
                sliced_pid = primary_pid.rsplit('/', 1)
                if len(sliced_pid) >= 2:
                    access_application_URL = 'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource=' + sliced_pid[1]

        # Coverage of the form "<begin> - <end>" is split into two fields.
        temporal_coverage_begin = ""
        temporal_coverage_end = ""

        if coverage:
            split = [item.strip() for item in coverage.split("-")]
            if len(split) == 2:
                temporal_coverage_begin = split[0]
                temporal_coverage_end = split[1]

        # TODO: Check agent mapping.
        #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson")
        #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson")
        #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson")
        #print "###", _get_persons(resource_info, "//cmd:contactPerson")
        #print "###", _get_persons(resource_info, "//cmd:metadataInfo/cmd:metadataCreator")

        #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization")
        #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization")
        #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization")

        contacts = self._persons_as_contact(self._get_persons(resource_info, "//cmd:contactPerson"))

        agents = []
        agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
        agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"), 'owner'))

        agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization"), 'author'))
        agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"), 'owner'))

        # Reuse the existing package id if this pid was harvested before.
        existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
        package_id = existing_package_id if existing_package_id else get_unique_package_id()

        result = {'name': pid_to_name(package_id),
                  'language': ",".join(languages),
                  'pids': pids,
                  'version': version,
                  'notes': description,
                  'title': title,
                  'type': 'dataset',
                  'contact': contacts,
                  'agent': agents,
                  'availability': availability,
                  'direct_download_URL': direct_download_URL,
                  'access_request_URL': access_request_URL,
                  'access_application_URL': access_application_URL,
                  'temporal_coverage_begin': temporal_coverage_begin,
                  'temporal_coverage_end': temporal_coverage_end,
                  'license_id': license_identifier,
                  'license_URL': ''}

        if not languages:
            result['langdis'] = u'True'

        if package_id:
            result['id'] = package_id

        # TODO: Ask about distributionAccessMedium
        # _strip_first(_text_xpath(resource_info, "//cmd:distributionInfo/availability/text()"))
        # url = _strip_first(_text_xpath(resource_info, "//cmd:identificationInfo/cmd:url/text()"))

        return result
Example #14
0
    def _read(self):
        """Read harvested Dublin Core metadata into the unified internal
        harvester dict format.

        :return: dict of unified harvester fields ('title', 'pids', 'agent',
                 'contact', ...) including a freshly generated 'id'/'name'
                 and, when no language is present, 'langdis'.
        """
        project_funder, project_funding, project_name, project_homepage = _get_project_stuff(
            self.dc) or ('', '', '', '')

        # Todo! This needs to be improved to use also simple-dc
        # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
        availability, license_id, license_url, access_application_url = _get_rights(
            self.dc) or ('', '', '', '')
        if not availability:
            availability = first(self._get_availability())

        uploader = self._get_uploader()

        data_pids = list(_get_data_pids(self.dc))

        # Dead 'tags = []' initializer removed: it was immediately
        # overwritten by the comprehension below.
        #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
        #    tags.extend(self._resolve_tags(tag))
        tags = [a.string for a in self.dc('subject', recursive=False)]

        # Collect language-tagged titles into a JSON object string,
        # e.g. {"fin": "otsikko", "eng": "title"}.
        transl_json = {}
        for title in self.dc('title', recursive=False):
            lang = utils.convert_language(title.get('xml:lang', '').strip())
            transl_json[lang] = title.string.strip()

        title = json.dumps(transl_json)

        def _get_primary_pid(data_pids):
            # NOTE: deliberately removes the matched IDA pid from data_pids
            # so it is not duplicated among the 'relation' pids below; the
            # immediate return makes the remove-during-iteration safe.
            for dpid in data_pids:
                if dpid.startswith('urn:nbn:fi:csc-ida'):
                    data_pids.remove(dpid)
                    return [dpid]
            return []

        # Create a unified internal harvester format dict
        unified = dict(
            # ?=dc('source', recursive=False),
            # ?=dc('relation', recursive=False),
            # ?=dc('type', recursive=False),
            access_application_URL=access_application_url or '',

            # Todo! Implement
            access_request_URL='',
            algorithm=first(_get_algorithm(self.dc)) or '',

            # TODO: Handle availabilities better
            availability=availability,
            checksum=_get_checksum(self.dc) or '',
            direct_download_URL=first(_get_download(self.dc)) or '',

            # Todo! Implement
            discipline='',

            # Todo! Should be possible to implement with QDC, but not with OAI_DC
            # evdescr=[],
            # evtype=[],
            # evwhen=[],
            # evwho=[],

            # Todo! Implement
            geographic_coverage='',

            #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
            title=title,
            language=','.join(
                sorted(
                    [a.string for a in self.dc('language', recursive=False)])),
            license_URL=license_url or '',
            license_id=license_id or 'notspecified',

            # Todo! Using only the first entry, for now
            contact=[
                dict(name=name or "",
                     email=email or "",
                     URL=url or "",
                     phone=phone or "")
                for name, email, phone, url in self._get_maintainer_stuff()
            ],

            # Todo! IDA currently doesn't produce this, maybe in future
            # dc('hasFormat', recursive=False)
            mimetype=self._get_mime_type(),
            notes=self._read_notes(),

            # Todo! Using only the first entry, for now
            # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',
            pids=[
                dict(id=pid, provider=_get_provider(self.bs), type=u'primary')
                for pid in _get_primary_pid(data_pids)
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation') for pid in data_pids
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation')
                for pid in self._get_version_pids()
            ] + [
                dict(id=pid,
                     provider=_get_provider(self.bs),
                     type=u'relation',
                     relation=u'generalRelation')
                for pid in _get_metadata_pid(self.dc)
            ],
            agent=[
                dict(role='author',
                     name=orgauth.get('value', ''),
                     id='',
                     organisation=orgauth.get('org', ''),
                     URL='',
                     fundingid='') for orgauth in _get_org_auth(self.dc)
            ] + [
                dict(role='contributor',
                     name=contributor.get('value', ''),
                     id='',
                     organisation=contributor.get('org', ''),
                     URL='',
                     fundingid='') for contributor in _get_contributor(self.dc)
            ] + [
                dict(
                    role='funder',
                    name=first(project_name) or '',
                    id=first(project_name) or '',
                    organisation=first(project_funder) or "",
                    URL=first(project_homepage) or '',
                    fundingid=first(project_funding) or '',
                )
            ] + [
                dict(role='owner',
                     name=first([
                         a.get('resource')
                         for a in self.dc('rightsHolder', recursive=False)
                     ]) or first(_get_rightsholder(self.dc)) or '',
                     id='',
                     organisation='',
                     URL='',
                     fundingid='')
            ],
            tag_string=','.join(tags) or '',

            # Todo! Implement if possible
            temporal_coverage_begin='',
            temporal_coverage_end='',
            type='dataset',
            uploader=uploader,

            # Used in smear harvest code to extract variable, station and year values, but is not used when
            # creating the dataset via API.
            smear_url=first(_get_download(self.dc, False)) or '',

            # Todo! This should be more exactly picked
            version=(self.dc.modified or self.dc.date).string if
            (self.dc.modified or self.dc.date) else '',
            # version=dc(
            #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
            #         partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
        )
        if not unified['language']:
            unified['langdis'] = 'True'

        # Create id and name
        unified['id'] = generate_pid()
        unified['name'] = pid_to_name(unified['id'])

        # If primary pid is missing, set package id as primary pid
        if not any(
                pid.get('type', None) == u'primary'
                for pid in unified['pids']):
            unified['pids'].append(
                dict(id=unified['id'], type=u'primary', provider=None))

        # if not unified['project_name']:
        #    unified['projdis'] = 'True'
        return unified