def populate_harvest_job(self, harvest_job, set_ids, config, client):
        """Create one HarvestObject per identifier to process in this gather run.

        The identifier list comes from ``self.get_package_ids``. Unless
        ``self._recreate(harvest_job)`` is truthy, identifiers that match an
        existing package name are dropped. GUIDs of objects that ended in the
        'ERROR' state in the previous job are then added back.

        :param harvest_job: the job the new HarvestObjects are attached to
        :param set_ids: passed through to ``self.get_package_ids``
        :param config: mapping; an optional 'limit' key caps how many
            identifiers are turned into objects
        :param client: passed through to ``self.get_package_ids``
        :return: list of created HarvestObject ids, or None (after saving a
            gather error) when there is nothing to harvest
        :raises Exception: anything raised while creating the objects is
            recorded as a gather error and re-raised
        """
        # Latest *other* job of this source whose gather stage has finished.
        # NOTE(review): presumably these are SQLAlchemy column expressions,
        # which is why `!= None` is kept instead of `is not None` - confirm.
        earlier_job = (
            Session.query(HarvestJob)
            .filter(HarvestJob.source == harvest_job.source)
            .filter(HarvestJob.gather_finished != None)
            .filter(HarvestJob.id != harvest_job.id)
            .order_by(HarvestJob.gather_finished.desc())
            .limit(1)
            .first()
        )

        # Only harvest incrementally when the previous job finished and the
        # source package has not been modified since that gather started.
        since = None
        if earlier_job and earlier_job.finished:
            source_package = model.Package.get(harvest_job.source.id)
            if source_package.metadata_modified < earlier_job.gather_started:
                since = earlier_job.gather_started.isoformat()

        # Collect package ids
        identifiers = list(self.get_package_ids(set_ids, config, since, client))
        log.debug('Identifiers: %s', identifiers)

        if not self._recreate(harvest_job) and identifiers:
            # Map package name -> identifier; identifiers ending in 'm' are
            # additionally registered under the name of their 's' variant.
            name_to_identifier = {}
            for identifier in identifiers:
                name_to_identifier[datapid_to_name(identifier)] = identifier
                if identifier.endswith(u'm'):
                    s_variant = u"%ss" % identifier[0:-1]
                    name_to_identifier[datapid_to_name(s_variant)] = identifier

            existing_packages = model.Session.query(model.Package).filter(
                model.Package.name.in_(name_to_identifier.keys())).all()
            for package in existing_packages:
                lookup_name = package.name
                if name_to_identifier[lookup_name] not in identifiers:
                    lookup_name = "%sm" % lookup_name[0:-1]
                identifiers.remove(name_to_identifier[lookup_name])

        if earlier_job:
            # Retry everything that failed during the previous job.
            failed_objects = Session.query(HarvestObject) \
                .filter(HarvestObject.harvest_job_id == earlier_job.id) \
                .filter(HarvestObject.state == 'ERROR').all()
            failed_guids = [failed.guid for failed in failed_objects]
            for guid in failed_guids:
                if guid not in identifiers:
                    identifiers.append(guid)

        try:
            created_ids = []
            if not identifiers:
                self._save_gather_error('No packages received for URL: {u}'.format(
                    u=harvest_job.source.url), harvest_job)
                return None

            if 'limit' in config:
                batch = islice(identifiers, config['limit'])
            else:
                batch = identifiers

            for guid in batch:
                # Create a new HarvestObject for this identifier
                harvest_object = HarvestObject(guid=guid, job=harvest_job)
                harvest_object.save()
                created_ids.append(harvest_object.id)
            log.debug('Object ids: {i}'.format(i=created_ids))
            return created_ids
        except Exception as e:
            self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
            raise
    def _to_name(cls, identifier):
        """Return the CKAN package name for an identifier.

        The identifier is first passed through ``cls._to_identifier`` and the
        result is handed to ``datapid_to_name``.

        :param identifier: identifier string
        :return: CKAN package name
        """
        resolved = cls._to_identifier(identifier)
        return datapid_to_name(resolved)
Exemple #3
0
def default_name_from_id(key, data, errors, context):
    '''
    Fill in ``data[key]`` from the package id when no name has been given.

    If ``data`` already holds a truthy value for ``key`` it is left untouched.
    Otherwise the value stored under ``('id',)`` is converted with
    ``utils.datapid_to_name`` and written to ``data[key]``.

    :param key: key
    :param data: data
    :param errors: validation errors
    :param context: context
    '''
    if data.get(key):
        # A name was supplied; nothing to generate.
        return

    package_id = data.get(('id',))
    data[key] = utils.datapid_to_name(package_id)
 def test_datapid_to_name(self):
     """A URL-style PID converts to a non-empty name without slashes."""
     converted = utils.datapid_to_name('http://example.com/some/thing?good=true')
     assert converted
     assert '/' not in converted
Exemple #5
0
    def _ddi2ckan(self, original_url, original_xml, harvest_object):
        '''Extract package values from bs4 object 'ddi_xml' parsed from xml

        Reads the mandatory and optional metadata fields from ``self.ddi_xml``,
        flattens the ``docDscr`` and ``stdyDscr`` elements into a path -> value
        mapping, and assembles everything into a CKAN package dict.

        :param original_url: URL the DDI document came from; used to decide
            whether FSD-specific defaults apply (via ``_is_fsd``)
        :param original_xml: not read by this method
        :param harvest_object: harvest object being imported, or a falsy value;
            when given, the owner organisation is looked up through its harvest
            source and its ``content`` is reset to None before returning
        :return: package dict, including the flattened values under 'xpaths'
        '''
        # TODO: Use .extract() and .string.extract() function so handled elements are removed from ddi_xml.
        # Path expressions handed to self._read_value as strings.
        doc_citation = "ddi_xml.codeBook.docDscr.citation"
        stdy_dscr = "ddi_xml.codeBook.stdyDscr"

        ####################################################################
        #      Read mandatory metadata fields:                             #
        ####################################################################
        # Authors & organizations
        authors = self.get_authors(self.ddi_xml.stdyDscr.citation, 'AuthEnty')
        # Copy, so that 'authors' itself stays untouched for _get_events below.
        agent = authors[:]
        agent.extend(self.get_contributors(self.ddi_xml.stdyDscr.citation))

        # Availability
        availability = AVAILABILITY_DEFAULT
        if _access_request_URL_is_found():
            availability = 'direct_download'
        # The FSD setting takes precedence over the two above.
        if _is_fsd(original_url):
            availability = AVAILABILITY_FSD

        # Keywords
        keywords = self.get_keywords(self.ddi_xml.stdyDscr.stdyInfo.subject)

        # Language
        # TODO: Where/how to extract multiple languages: 'language': u'eng, fin, swe' ?
        language = self.convert_language(
            self._read_value("ddi_xml.codeBook.get('xml:lang')"))

        # Titles
        titles = self._read_value(stdy_dscr + ".citation.titlStmt(['titl', 'parTitl'])") or \
            self._read_value(doc_citation + ".titlStmt(['titl', 'parTitl'])", mandatory_field=True)

        # langtitle=[dict(lang=self.convert_language(a.get('xml:lang', '')), value=a.text) for a in titles]
        # [{"lang":"fin", "value":"otsikko"}, {"lang:"en", "value":"title"}]

        # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
        transl_json = {}
        first_title = ""

        # default to finnish, since first title has no lang value, which causes the validator to whine
        # we might want to update the DDI harvester to accept a language configuration parameter, if
        # we decide to harvest DDI resources from other sources.
        default_lang = "fi"
        for title in titles:
            transl_json[self.convert_language(title.get('xml:lang', default_lang))] = title.text

            # save the first title for use later on (resource description)
            if not first_title:
                first_title = title.text

        title = json.dumps(transl_json)

        # License
        # TODO: Extract prettier output. Should we check that element contains something?
        # Should this be in optional section if not mandatory_field?
        license_url = self._read_value(stdy_dscr + ".dataAccs.useStmt.get_text(separator=u' ')", mandatory_field=False)
        if _is_fsd(original_url):
            license_id = LICENSE_ID_FSD
        else:
            license_id = LICENSE_ID_DEFAULT

        # Contact (package_extra.key: contact_[k]_name in database, contact in WUI)
        contact_name = self._read_value(stdy_dscr + ".citation.distStmt('contact')") or \
                     self._read_value(stdy_dscr + ".citation.distStmt('distrbtr')") or \
                     self._read_value(doc_citation + ".prodStmt('producer')", mandatory_field=True)
        # TODO: clean out (or ask FSD to clean) mid text newlines (eg. in FSD2482)
        if contact_name and contact_name[0].text:
            contact_name = contact_name[0].text
        else:
            # Fall back to the producer's affiliation attribute.
            contact_name = self._read_value(stdy_dscr + ".citation.prodStmt.producer.get('affiliation')", mandatory_field=True)
        if _is_fsd(original_url):
            contact_email = CONTACT_EMAIL_FSD
            # TODO: Allow trying other email also in FSD metadata
        else:
            contact_email = self._read_value(stdy_dscr + ".citation.distStmt.contact.get('email')", mandatory_field=True)

        # Modified date
        version = self.get_attr_optional(self.ddi_xml.stdyDscr.citation,
                                         'prodDate', 'date') or \
                  self.get_attr_mandatory(self.ddi_xml.stdyDscr.citation,
                                          'version', 'date')

        # Name
        name_prefix = self._read_value(stdy_dscr + ".citation.titlStmt.IDNo.get('agency')", mandatory_field=False)
        name_id = self._read_value(stdy_dscr + ".citation.titlStmt.IDNo.text", mandatory_field=False)
        if not name_prefix:
            name_prefix = self._read_value(doc_citation + ".titlStmt.IDNo['agency']", mandatory_field=True)
        if not name_id:
            name_id = self._read_value(doc_citation + ".titlStmt.IDNo.text", mandatory_field=True)
        name = utils.datapid_to_name(name_prefix + name_id)

        pids = list()
        pids.append({'id': name, 'type': 'data', 'primary': 'True', 'provider': name_prefix})

        # Should we generate a version PID?
        # vpid = utils.generate_pid()
        # pids.append({'id': vpid, 'type': 'version', 'provider': 'kata'})

        # Original web page as resource
        # For FSD 'URI' leads to summary web page of data, hence format='html'
        orig_web_page = self._read_value(doc_citation + ".holdings.get('URI', '')")
        if orig_web_page:
            orig_web_page_resource = {'description': first_title,
                                      'format': u'html',
                                      'resource_type': 'documentation',
                                      'url': orig_web_page}
        else:
            orig_web_page_resource = {}

        # Owner
        owner = self._read_value(stdy_dscr + ".citation.prodStmt.producer.text") or \
                self._read_value(stdy_dscr + ".citation.rspStmt.AuthEnty.text") or \
                self._read_value(doc_citation + ".prodStmt.producer.string", mandatory_field=True)
        agent.append({'role': 'owner',
                      'name': owner})

        # Owner organisation: harvest source package -> its owner_org -> group name
        if harvest_object:
            hsid = harvest_object.harvest_source_id
            hsooid = model.Session.query(model.Package).filter(model.Package.id==hsid).one().owner_org
            owner_org = model.Session.query(model.Group).filter(model.Group.id==hsooid).one().name
        else:
            owner_org = u''

        # Distributor (Agent: distributor, the same is used as contact)
        agent.append({
            'role': 'distributor',
            'name': contact_name})

        ####################################################################
        #      Read optional metadata fields:                              #
        ####################################################################
        # Availability
        if _is_fsd(original_url):
            access_request_url = ACCESS_REQUEST_URL_FSD
        else:
            access_request_url = u''

        # Contact
        contact_phone = self._read_value(doc_citation + ".holdings.get('callno')") or \
                        self._read_value(stdy_dscr + ".citation.holdings.get('callno')")

        # The conditional fallback must be parenthesised: `x if c else y` binds
        # more loosely than `or`, so without the parentheses the whole chain
        # became None for every non-FSD source, discarding URLs found in the XML.
        contact_URL = self._read_value(stdy_dscr + ".dataAccs.setAvail.accsPlac.get('URI')") or \
                      self._read_value(stdy_dscr + ".citation.distStmt.contact.get('URI')") or \
                      self._read_value(stdy_dscr + ".citation.distStmt.distrbtr.get('URI')") or \
                      (CONTACT_URL_FSD if _is_fsd(original_url) else None)

        # convert the descriptions to a JSON string of type {"fin":"aineiston kuvaus", "eng","dataset description"}
        descriptions = self._read_value(stdy_dscr + ".stdyInfo.abstract('p')")
        if not descriptions:
            descriptions = self._read_value(stdy_dscr + ".citation.serStmt.serInfo('p')")
        translated_notes = {}

        # Paragraphs sharing a language are joined with a blank line between them.
        for des in descriptions:
            lang = self.convert_language(des.get('xml:lang', 'fi'))
            if lang in translated_notes:
                translated_notes[lang] += '\r\n\r\n' + des.text
            else:
                translated_notes[lang] = des.text

        notes = json.dumps(translated_notes)

        # Discipline
        discipline = self.get_discipline(self.ddi_xml.stdyDscr.stdyInfo.subject)

        # Dataset lifetime events
        events = self._get_events(stdy_dscr, authors)

        # Geographic coverage
        geo_cover = self.get_geo_coverage(self.ddi_xml)

        # Temporal coverage
        temp_start, temp_end = self.get_temporal_coverage(self.ddi_xml)

        # Citation
        citation = self._read_value(stdy_dscr + ".citation.biblCit.text", mandatory_field=False)


        ####################################################################
        #      Flatten rest to 'XPath/path/to/element': 'value' pairs      #
        ####################################################################
        etree_xml = etree.fromstring(str(self.ddi_xml))
        flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}docDscr'))
        xpath_dict = flattened_ddi.getMap()
        flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}stdyDscr'))
        xpath_dict.update(flattened_ddi.getMap())


        package_dict = dict(
            access_application_URL=u'',
            access_request_URL=unicode(access_request_url),
            agent=agent,
            algorithm=u'',   # To be implemented straight in 'resources'
            availability=unicode(availability),
            contact=[{'name': contact_name,
                      'email': contact_email,
                      'URL': contact_URL,
                      'phone': contact_phone}],
            direct_download_URL=u'',  # To be implemented straight in 'resources
            discipline=discipline,
            event=events,
            geographic_coverage=geo_cover,
            groups=[],
            # Reuse the id of an existing package with this name, else mint a new PID.
            id=self._get_id_by_name(name) or generate_pid(),
            # langtitle=langtitle,
            langdis=u'True',  # NOTE!
            language=language,
            license_URL=license_url,
            license_id=license_id,
            mimetype=u'',  # To be implemented straight in 'resources
            name=name,
            notes=notes or u'',
            pids=pids,
            owner_org=owner_org,
            resources=[orig_web_page_resource],
            tag_string=keywords,
            temporal_coverage_begin=temp_start,
            temporal_coverage_end=temp_end,
            # title=langtitle[0].get('value'),   # Must exist in package dict
            title=title,
            type='dataset',
            version=version,
            version_PID='',
            citation=citation
        )
        package_dict['xpaths'] = xpath_dict
        # Above line creates:
        # package_dict = {
        #     'access_request_url': 'some_url',
        #     # ...
        #     'xpaths': {'stdyDscr/othrStdyMat.0/relPubl.34':
        #                'Uskon asia: nuorisobarometri 2006 (2006).'},
        #               {'stdyD...': 'Some value'}]
        # }
        #package_dict['extras'].update(_save_ddi_variables_to_csv(ddi_xml, somepkg))


        # Order of the old code:
        #_save_original_xml_and_link_as_resources()
        #_save_ddi_variables_to_csv()
        #_create_group_based_on_organizations()
        #_last_statements_to_rewrite()

        # JuhoL: Set harvest object to some end state and commit
        if harvest_object is not None:
            harvest_object.content = None
            # Should this be flushed? model.Session.flush()
        #model.repo.commit()

        return package_dict