def populate_harvest_job(self, harvest_job, set_ids, config, client):
    """Fill a harvest job with HarvestObjects to be fetched.

    :param harvest_job: HarvestJob instance being gathered
    :param set_ids: OAI-PMH set identifiers to restrict the listing
    :param config: harvest source configuration dict (may contain 'limit')
    :param client: client object passed to ``get_package_ids``
    :return: list of created HarvestObject ids, or None if no packages
             were received
    :raises: re-raises any exception hit while creating harvest objects
             (after recording it via ``_save_gather_error``)
    """
    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source == harvest_job.source) \
        .filter(HarvestJob.gather_finished != None) \
        .filter(HarvestJob.id != harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    # Harvest incrementally (from the previous gather start) only when the
    # source package has not been modified since that gather started.
    last_time = None
    if previous_job and previous_job.finished and model.Package.get(harvest_job.source.id).metadata_modified < previous_job.gather_started:
        last_time = previous_job.gather_started.isoformat()

    # Collect package ids
    package_ids = list(self.get_package_ids(set_ids, config, last_time, client))
    log.debug('Identifiers: %s', package_ids)

    if not self._recreate(harvest_job) and package_ids:
        # Map CKAN package names back to harvested identifiers so that
        # identifiers which already exist as packages can be removed.
        converted_identifiers = {}
        for identifier in package_ids:
            converted_identifiers[pid_to_name(identifier)] = identifier
            # Identifiers ending in 'm' also map from their 's'-suffixed
            # name variant.
            if identifier.endswith(u'm'):
                converted_identifiers[pid_to_name(u"%ss" % identifier[0:-1])] = identifier

        for package in model.Session.query(model.Package).filter(model.Package.name.in_(converted_identifiers.keys())).all():
            converted_name = package.name
            if converted_identifiers[converted_name] not in package_ids:
                # Name matched via the 's' variant; recover the 'm' form.
                converted_name = "%sm" % converted_name[0:-1]
            package_ids.remove(converted_identifiers[converted_name])

    if previous_job:
        # Re-queue every identifier that errored in the previous job.
        for previous_error in [error.guid for error in Session.query(HarvestObject).filter(HarvestObject.harvest_job_id == previous_job.id).filter(HarvestObject.state == 'ERROR').all()]:
            if previous_error not in package_ids:
                package_ids.append(previous_error)

    try:
        object_ids = []
        if len(package_ids):
            # Honour an optional 'limit' from the source configuration.
            for package_id in islice(package_ids, config['limit']) if 'limit' in config else package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            log.debug('Object ids: {i}'.format(i=object_ids))
            return object_ids
        else:
            self._save_gather_error('No packages received for URL: {u}'.format(
                u=harvest_job.source.url), harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
        raise
def test_resource_read_redirect(self):
    """
    resource_read should redirect to dataset page.
    """
    model.repo.new_revision()
    model.Session.commit()
    res_id = None
    # Use the standard CKAN test fixture package and give it a PID-based name.
    pkg = model.Package.get(u'annakarenina')
    pkg.name = utils.pid_to_name(pkg.id)
    model.Package.save(pkg)
    # Mark the 'Full text.' resource as a dataset-type resource so that
    # reading it should trigger the redirect.
    for resource in pkg.resources:
        if 'Full text.' in resource.description:
            model.repo.new_revision()
            resource.resource_type = settings.RESOURCE_TYPE_DATASET
            model.Session.commit()
            res_id = resource.id
    offset = '/en' + url_for(controller='package', action='resource_read',
                             id=pkg.id, resource_id=res_id)
    extra_environ = {'REMOTE_USER': '******'}
    result = self.app.get(offset, extra_environ=extra_environ)
    # Redirect should redirect to dataset page
    result = result.follow()
    # The dataset page must not render the resource description, but must
    # still be parseable HTML with content.
    assert result.body.count('Full text.') == 0
    assert len(etree.fromstring(result.body, parser=self.html_parser))
def default_name_from_id(key, data, errors, context):
    """Derive the dataset name from package.id, unconditionally.

    :param key: destination key in the flattened data dict
    :param data: flattened package data
    :param errors: validation errors (unused)
    :param context: validation context (unused)
    """
    package_id = data.get(('id',))
    data[key] = utils.pid_to_name(package_id)
def default_name_from_id(key, data, errors, context):
    """Always generate the package name from its id.

    :param key: target key for the generated name
    :param data: flattened data dict
    :param errors: validation errors (ignored)
    :param context: context (ignored)
    """
    pid = data.get(('id',))
    data[key] = utils.pid_to_name(pid)
def test_reader(self):
    """CmdiReader should extract the expected fields from cmdi_1.xml."""
    record = _get_record("cmdi_1.xml")
    metadata = CmdiReader("http://localhost/test")(record)
    content = metadata.getMap()
    package = content['unified']

    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead.
    self.assertEqual(package.get('name', None),
                     utils.pid_to_name(package.get('id', None)))
    self.assertEqual(utils.get_primary_pid(package),
                     u'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEqual(package.get('notes', None), '{"eng": "Test description"}')
    self.assertEqual(package.get('version', None), '2012-09-07')
    self.assertEqual(package.get('title', []), '{"eng": "Longi Corpus"}')
def read_data(self, xml):
    """ Extract package data from given XML.
    :param xml: xml element (lxml)
    :return: dictionary of the unified package fields
    :raises CmdiReaderException: if the CMD or resourceInfo element is missing
    """
    cmd = first(
        xml.xpath('//oai:record/oai:metadata/cmd:CMD',
                  namespaces=self.namespaces))
    if cmd is None:
        raise CmdiReaderException(
            "Unexpected XML format: No CMD -element found")

    resource_info = cmd.xpath("//cmd:Components/cmd:resourceInfo",
                              namespaces=self.namespaces)[0]
    if resource_info is None:
        raise CmdiReaderException(
            "Unexpected XML format: No resourceInfo -element found")

    metadata_identifiers = self._text_xpath(
        cmd, "//cmd:identificationInfo/cmd:identifier/text()")

    languages = self._text_xpath(
        cmd,
        "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()"
    )

    # convert the descriptions to a JSON string of type {"fin":"kuvaus", "eng","desc"}
    desc_json = {}
    for desc in xml.xpath("//cmd:identificationInfo/cmd:description",
                          namespaces=self.namespaces):
        lang = convert_language(
            desc.get('{http://www.w3.org/XML/1998/namespace}lang',
                     'undefined').strip())
        desc_json[lang] = unicode(desc.text).strip()

    description = json.dumps(desc_json)

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
    transl_json = {}
    for title in xml.xpath('//cmd:identificationInfo/cmd:resourceName',
                           namespaces=self.namespaces):
        lang = convert_language(
            title.get('{http://www.w3.org/XML/1998/namespace}lang',
                      'undefined').strip())
        transl_json[lang] = title.text.strip()

    title = json.dumps(transl_json)
    provider = self.provider

    version = first(
        self._text_xpath(
            resource_info,
            "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
    coverage = first(
        self._text_xpath(
            resource_info,
            "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()"
        )) or ""

    pids = []
    primary_pid = ''
    direct_download_URL = ''
    access_request_URL = ''
    access_application_URL = ''

    # data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")

    # Only the first URN-style identifier becomes the primary PID.
    for pid in [
            CmdiReader._language_bank_urn_pid_enhancement(metadata_pid)
            for metadata_pid in metadata_identifiers
    ]:
        if 'urn' in pid and not primary_pid:
            pids.append(dict(id=pid, provider=provider, type='primary'))
            primary_pid = pid
    # else:
    #     pids.append(dict(id=pid, provider=provider, type='relation', relation='generalRelation'))
    #
    # pids += [dict(id=CmdiReader._language_bank_urn_pid_enhancement(pid), provider=provider, type='relation',
    #               relation='generalRelation') for pid in data_identifiers]

    license_identifier = CmdiReader._language_bank_license_enhancement(
        first(
            self._text_xpath(
                resource_info,
                "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()"
            )) or 'notspecified')
    availability = CmdiReader._language_bank_availability_from_license(
        license_identifier)

    # The availability class derived from the license decides which of the
    # access URL fields receives the primary PID.
    if license_identifier.lower().strip() != 'undernegotiation':
        if availability == 'direct_download':
            direct_download_URL = primary_pid
        if availability == 'access_request':
            access_request_URL = primary_pid
        if availability == 'access_application_other':
            sliced_pid = primary_pid.rsplit('/', 1)
            if len(sliced_pid) >= 2:
                access_application_URL = 'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource=' + sliced_pid[
                    1]

    # Coverage like "1880 - 1939" is split into begin/end years.
    temporal_coverage_begin = ""
    temporal_coverage_end = ""
    if coverage:
        split = [item.strip() for item in coverage.split("-")]
        if len(split) == 2:
            temporal_coverage_begin = split[0]
            temporal_coverage_end = split[1]

    # TODO: Check agent mapping.
    #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson")
    #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson")
    #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson")
    #print "###", _get_persons(resource_info, "//cmd:contactPerson")
    #print "###", _get_persons(resource_info, "//cmd:metadataInfo/cmd:metadataCreator")
    #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization")
    #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization")
    #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization")
    contacts = self._persons_as_contact(
        self._get_persons(resource_info, "//cmd:contactPerson"))

    agents = []
    agents.extend(
        self._persons_as_agent(
            self._get_persons(
                resource_info,
                "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
    agents.extend(
        self._persons_as_agent(
            self._get_persons(
                resource_info,
                "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"
            ), 'owner'))
    agents.extend(
        self._organization_as_agent(
            self._get_organizations(
                resource_info,
                "//cmd:distributionInfo/cmd:iprHolderOrganization"),
            'author'))
    agents.extend(
        self._organization_as_agent(
            self._get_organizations(
                resource_info,
                "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"
            ), 'owner'))

    # Reuse the package id of an existing package with this primary PID,
    # otherwise mint a new one.
    existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
    package_id = existing_package_id if existing_package_id else get_unique_package_id(
    )

    result = {
        'name': pid_to_name(package_id),
        'language': ",".join(languages),
        'pids': pids,
        'version': version,
        'notes': description,
        'title': title,
        'type': 'dataset',
        'contact': contacts,
        'agent': agents,
        'availability': availability,
        'direct_download_URL': direct_download_URL,
        'access_request_URL': access_request_URL,
        'access_application_URL': access_application_URL,
        'temporal_coverage_begin': temporal_coverage_begin,
        'temporal_coverage_end': temporal_coverage_end,
        'license_id': license_identifier,
        'license_URL': ''
    }

    if not languages:
        result['langdis'] = u'True'

    if package_id:
        result['id'] = package_id

    # TODO: Ask about distributionAccessMedium
    # _strip_first(_text_xpath(resource_info, "//cmd:distributionInfo/availability/text()"))
    # url = _strip_first(_text_xpath(resource_info, "//cmd:identificationInfo/cmd:url/text()"))

    return result
def test_import(self):
    """Importing cmdi_1.xml and cmdi_2.xml should create, update and
    finally delete a package with the expected field values."""
    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    harvest_object = self._run_import("cmdi_1.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']

    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead.
    self.assertEqual(
        len(harvest_object.errors), 0, u"\n".join(
            unicode(error.message)
            for error in (harvest_object.errors or [])))

    package = get_action('package_show')({
        'user': '******'
    }, {
        'id': package_id
    })

    self.assertEqual(package.get('name', None),
                     utils.pid_to_name(package.get('id', None)))
    self.assertEqual(utils.get_primary_pid(package),
                     u'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEqual(package.get('notes', None),
                     u'{"eng": "Test description"}')
    self.assertEqual(package.get('version', None), '2012-09-07')
    self.assertEqual(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEqual(package.get('license_id', None), 'undernegotiation')

    # The ISLRN identifier must not appear as a relation PID.
    provider = config['ckan.site_url']
    expected_pid = {
        u'id': u'http://islrn.org/resources/248-895-085-557-0',
        u'provider': provider,
        u'type': u'relation',
        u'relation': u'generalRelation'
    }

    self.assertTrue(expected_pid not in package.get('pids'))

    model.Session.flush()

    # Second import updates the same package with new coverage/license.
    harvest_object = self._run_import("cmdi_2.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']

    self.assertEqual(
        len(harvest_object.errors), 0, u"\n".join(
            unicode(error.message)
            for error in (harvest_object.errors or [])))

    package = get_action('package_show')({
        'user': '******'
    }, {
        'id': package_id
    })

    self.assertEqual(package['temporal_coverage_begin'], '1880')
    self.assertEqual(package['temporal_coverage_end'], '1939')
    self.assertEqual(package.get('license_id', None), 'other')

    # Delete package
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()

    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEqual(model.Package.get(package['id']).state, 'deleted')
def test_pid_to_name(self):
    """A name generated from a PID must be non-empty and contain no slashes."""
    generated = utils.pid_to_name('http://example.com/some/thing?good=true')
    assert generated
    assert '/' not in generated
def _ddi2ckan(self, original_url, original_xml, harvest_object):
    '''Extract package values from bs4 object 'ddi_xml' parsed from xml

    :param original_url: URL the DDI document was harvested from
    :param original_xml: original harvested XML (unused directly here)
    :param harvest_object: HarvestObject being imported, or None
    :return: package dict in the unified internal harvester format
    '''
    # TODO: Use .extract() and .string.extract() function so handled elements are removed from ddi_xml.
    # These are string prefixes for self._read_value() expressions, not objects.
    doc_citation = "ddi_xml.codeBook.docDscr.citation"
    stdy_dscr = "ddi_xml.codeBook.stdyDscr"

    ####################################################################
    #      Read mandatory metadata fields:                             #
    ####################################################################
    # Authors & organizations
    authors = self.get_authors(self.ddi_xml.stdyDscr.citation, 'AuthEnty')
    agent = authors[:]
    agent.extend(self.get_contributors(self.ddi_xml.stdyDscr.citation))

    # Availability
    availability = AVAILABILITY_DEFAULT
    if _access_request_URL_is_found():
        availability = 'direct_download'
    if _is_fsd(original_url):
        availability = AVAILABILITY_FSD

    # Keywords
    keywords = self.get_keywords(self.ddi_xml.stdyDscr.stdyInfo.subject)

    # Language
    # TODO: Where/how to extract multiple languages: 'language': u'eng, fin, swe' ?
    language = self.convert_language(
        self._read_value("ddi_xml.codeBook.get('xml:lang')"))

    # Titles
    titles = self._read_value(stdy_dscr + ".citation.titlStmt(['titl', 'parTitl'])") or \
        self._read_value(doc_citation + ".titlStmt(['titl', 'parTitl'])", mandatory_field=True)

    # langtitle=[dict(lang=self.convert_language(a.get('xml:lang', '')), value=a.text) for a in titles]
    # [{"lang":"fin", "value":"otsikko"}, {"lang:"en", "value":"title"}]

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
    transl_json = {}
    first_title = ""

    # default to finnish, since first title has no lang value, which causes the validator to whine
    # we might want to update the DDI harvester to accept a language configuration parameter, if
    # we decide to harvest DDI resources from other sources.
    default_lang = "fi"
    for title in titles:
        transl_json[self.convert_language(title.get('xml:lang', default_lang))] = title.text
        # we want to get save the first title for use lateron
        if not first_title:
            first_title = title.text

    title = json.dumps(transl_json)

    # License
    # TODO: Extract prettier output. Should we check that element contains something?
    # Should this be in optional section if not mandatory_field?
    license_url = self._read_value(stdy_dscr + ".dataAccs.useStmt.get_text(separator=u' ')", mandatory_field=False)
    if _is_fsd(original_url):
        license_id = LICENSE_ID_FSD
    else:
        license_id = LICENSE_ID_DEFAULT

    # Contact (package_extra.key: contact_[k]_name in database, contact in WUI)
    contact_name = self._read_value(stdy_dscr + ".citation.distStmt('contact')") or \
        self._read_value(stdy_dscr + ".citation.distStmt('distrbtr')") or \
        self._read_value(doc_citation + ".prodStmt('producer')", mandatory_field=True)
    # TODO: clean out (or ask FSD to clean) mid text newlines (eg. in FSD2482)
    if contact_name and contact_name[0].text:
        contact_name = contact_name[0].text
    else:
        contact_name = self._read_value(stdy_dscr + ".citation.prodStmt.producer.get('affiliation')", mandatory_field=True)
    if _is_fsd(original_url):
        contact_email = CONTACT_EMAIL_FSD
        # TODO: Allow trying other email also in FSD metadata
    else:
        contact_email = self._read_value(stdy_dscr + ".citation.distStmt.contact.get('email')", mandatory_field=True)

    # Modified date
    version = self.get_attr_optional(self.ddi_xml.stdyDscr.citation, 'prodDate', 'date') or \
        self.get_attr_mandatory(self.ddi_xml.stdyDscr.citation, 'version', 'date')

    # This idNos is an FSD specific solution
    idNos = self._read_value(stdy_dscr + ".citation.titlStmt.find_all('IDNo')", mandatory_field=False)
    if not idNos:
        idNos = self._read_value(doc_citation + ".titlStmt.find_all('IDNo')", mandatory_field=True)

    pids = list()
    idNoValues = [bsIdNo.text for bsIdNo in idNos]
    agencies = [bsIdNo.get('agency') for bsIdNo in idNos]
    primary_pid = None

    if len(idNoValues) == len(agencies):
        for idNoVal, agency in zip(idNoValues, agencies):
            # Adjacent string literals concatenate to 'Kansalliskirjasto'
            # (the National Library of Finland, the primary PID agency).
            if agency == 'Kansalli' 'skirjasto':
                pids.append({'id': idNoVal, 'type': 'primary', 'provider': agency})
                primary_pid = idNoVal
            else:
                pids.append({'id': agency + idNoVal, 'type': 'relation', 'provider': agency, 'relation': 'generalRelation'})

    # Should we generate a version PID?
    # vpid = utils.generate_pid()
    # pids.append({'id': vpid, 'type': 'version', 'provider': 'kata'})

    # Original web page as resource
    # For FSD 'URI' leads to summary web page of data, hence format='html'
    orig_web_page = self._read_value(doc_citation + ".holdings.get('URI', '')")
    if orig_web_page:
        orig_web_page_resource = {'description': first_title, 'format': u'html', 'resource_type': 'documentation', 'url': orig_web_page}
    else:
        orig_web_page_resource = {}

    # Owner
    owner = self._read_value(stdy_dscr + ".citation.prodStmt.producer.text") or \
        self._read_value(stdy_dscr + ".citation.rspStmt.AuthEnty.text") or \
        self._read_value(doc_citation + ".prodStmt.producer.string", mandatory_field=True)
    agent.append({'role': 'owner', 'name': owner})

    # Owner organisation: resolved from the harvest source's owning group.
    if harvest_object:
        hsid = harvest_object.harvest_source_id
        hsooid = model.Session.query(model.Package).filter(model.Package.id==hsid).one().owner_org
        owner_org = model.Session.query(model.Group).filter(model.Group.id==hsooid).one().name
    else:
        owner_org = u''

    # Distributor (Agent: distributor, the same is used as contact)
    agent.append({'role': 'distributor', 'name': contact_name})

    ####################################################################
    #      Read optional metadata fields:                              #
    ####################################################################
    # Availability
    if _is_fsd(original_url):
        access_request_url = ACCESS_REQUEST_URL_FSD
    else:
        access_request_url = u''

    # Contact
    contact_phone = self._read_value(doc_citation + ".holdings.get('callno')") or \
        self._read_value(stdy_dscr + ".citation.holdings.get('callno')")

    # NOTE(review): the trailing conditional binds to the WHOLE or-chain —
    # for non-FSD sources contact_URL is always None regardless of the
    # accsPlac/contact/distrbtr URIs. Looks unintended; confirm before changing.
    contact_URL = self._read_value(stdy_dscr + ".dataAccs.setAvail.accsPlac.get('URI')") or \
        self._read_value(stdy_dscr + ".citation.distStmt.contact.get('URI')") or \
        self._read_value(stdy_dscr + ".citation.distStmt.distrbtr.get('URI')") or \
        CONTACT_URL_FSD if _is_fsd(original_url) else None

    # convert the descriptions to a JSON string of type {"fin":"aineiston kuvaus", "eng","dataset description"}
    descriptions = self._read_value(stdy_dscr + ".stdyInfo.abstract('p')")
    if not descriptions:
        descriptions = self._read_value(stdy_dscr + ".citation.serStmt.serInfo('p')")
    translated_notes = {}
    for des in descriptions:
        lang = self.convert_language(des.get('xml:lang', 'fi'))
        # Multiple paragraphs in the same language are joined with blank lines.
        if lang in translated_notes:
            translated_notes[lang] += '\r\n\r\n' + des.text
        else:
            translated_notes[lang] = des.text
    notes = json.dumps(translated_notes)

    # Discipline
    discipline = self.get_discipline(self.ddi_xml.stdyDscr.stdyInfo.subject)

    # Dataset lifetime events
    events = self._get_events(stdy_dscr, authors)

    # Geographic coverage
    geo_cover = self.get_geo_coverage(self.ddi_xml)

    # Temporal coverage
    temp_start, temp_end = self.get_temporal_coverage(self.ddi_xml)

    # Citation
    citation = self._read_value(stdy_dscr + ".citation.biblCit.text", mandatory_field=False)

    ####################################################################
    # Flatten rest to 'XPath/path/to/element': 'value' pairs           #
    ####################################################################
    etree_xml = etree.fromstring(str(self.ddi_xml))
    flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}docDscr'))
    xpath_dict = flattened_ddi.getMap()
    flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}stdyDscr'))
    xpath_dict.update(flattened_ddi.getMap())

    # Reuse an existing package id when the primary PID is already known.
    existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
    package_id = existing_package_id if existing_package_id else get_unique_package_id()
    package_name = pid_to_name(package_id)

    package_dict = dict(
        access_application_URL=u'',
        access_request_URL=unicode(access_request_url),
        agent=agent,
        algorithm=u'',  # To be implemented straight in 'resources'
        availability=unicode(availability),
        contact=[{'name': contact_name,
                  'email': contact_email,
                  'URL': contact_URL,
                  'phone': contact_phone}],
        direct_download_URL=u'',  # To be implemented straight in 'resources
        discipline=discipline,
        event=events,
        geographic_coverage=geo_cover,
        groups=[],
        id=package_id,
        langdis=u'True',  # NOTE! (hardcoded)
        language=language,
        license_URL=license_url,
        license_id=license_id,
        mimetype=u'',  # To be implemented straight in 'resources
        name=package_name,
        notes=notes or u'',
        pids=pids,
        owner_org=owner_org,
        resources=[orig_web_page_resource],
        tag_string=keywords,
        temporal_coverage_begin=temp_start,
        temporal_coverage_end=temp_end,
        title=title,
        type='dataset',
        version=version,
        version_PID='',
        citation=citation
    )
    package_dict['xpaths'] = xpath_dict
    # Above line creates:
    # package_dict = {
    #     'access_request_url': 'some_url',
    #     # ...
    #     'xpaths': {'stdyDscr/othrStdyMat.0/relPubl.34':
    #                'Uskon asia: nuorisobarometri 2006 (2006).'},
    #                {'stdyD...': 'Some value'}]
    # }
    #package_dict['extras'].update(_save_ddi_variables_to_csv(ddi_xml, somepkg))

    # Order of the old code paths:
    #_save_original_xml_and_link_as_resources()
    #_save_ddi_variables_to_csv()
    #_create_group_based_on_organizations()
    #_last_statements_to_rewrite()

    # JuhoL: Set harvest object to some end state and commit
    if harvest_object is not None:
        harvest_object.content = None
    # Should this be flushed?
    model.Session.flush()
    #model.repo.commit()
    return package_dict
def _read(self):
    """Build the unified internal harvester dict from the parsed DC metadata.

    Reads rights, PIDs, titles, agents etc. from ``self.dc`` / ``self.bs``
    and returns the dict; also generates a fresh package id and name.

    :return: unified package dict
    """
    project_funder, project_funding, project_name, project_homepage = _get_project_stuff(self.dc) or ('', '', '', '')

    # Todo! This needs to be improved to use also simple-dc
    # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
    availability, license_id, license_url, access_application_url = _get_rights(self.dc) or ('', '', '', '')
    if not availability:
        availability = first(self._get_availability())
    uploader = self._get_uploader()

    data_pids = list(_get_data_pids(self.dc))

    tags = []
    #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
    #    tags.extend(self._resolve_tags(tag))
    tags = [a.string for a in self.dc('subject', recursive=False)]

    # Collect titles keyed by language into a JSON string.
    transl_json = {}
    for title in self.dc('title', recursive=False):
        lang = utils.convert_language(title.get('xml:lang', '').strip())
        transl_json[lang] = title.string.strip()

    title = json.dumps(transl_json)

    def _get_primary_pid(data_pids):
        # Pops the first IDA urn out of data_pids (mutates the list!) so it
        # is not duplicated in the relation PIDs below.
        for dpid in data_pids:
            if dpid.startswith('urn:nbn:fi:csc-ida'):
                data_pids.remove(dpid)
                return [dpid]
        return []

    # Create a unified internal harvester format dict
    unified = dict(
        # ?=dc('source', recursive=False),
        # ?=dc('relation', recursive=False),
        # ?=dc('type', recursive=False),
        access_application_URL=access_application_url or '',
        # Todo! Implement
        access_request_URL='',
        algorithm=first(_get_algorithm(self.dc)) or '',
        # TODO: Handle availabilities better
        availability=availability,
        checksum=_get_checksum(self.dc) or '',
        direct_download_URL=first(_get_download(self.dc)) or '',
        # Todo! Implement
        discipline='',
        # Todo! Should be possible to implement with QDC, but not with OAI_DC
        # evdescr=[],
        # evtype=[],
        # evwhen=[],
        # evwho=[],
        # Todo! Implement
        geographic_coverage='',
        #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
        title=title,
        language=','.join(sorted([a.string for a in self.dc('language', recursive=False)])),
        license_URL=license_url or '',
        license_id=license_id or 'notspecified',
        # Todo! Using only the first entry, for now
        contact=[dict(name=name or "", email=email or "", URL=url or "", phone=phone or "")
                 for name, email, phone, url in self._get_maintainer_stuff()],
        # Todo! IDA currently doesn't produce this, maybe in future
        # dc('hasFormat', recursive=False)
        mimetype=self._get_mime_type(),
        notes=self._read_notes(),
        # Todo! Using only the first entry, for now
        # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',
        # NOTE: _get_primary_pid must run first — it removes the primary
        # pid from data_pids before the relation comprehension iterates it.
        pids=[dict(id=pid, provider=_get_provider(self.bs), type=u'primary') for pid in _get_primary_pid(data_pids)] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in data_pids] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in self._get_version_pids()] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation') for pid in _get_metadata_pid(self.dc)],
        agent=[dict(role='author', name=orgauth.get('value', ''), id='', organisation=orgauth.get('org', ''), URL='', fundingid='') for orgauth in _get_org_auth(self.dc)] +
              [dict(role='contributor', name=contributor.get('value', ''), id='', organisation=contributor.get('org', ''), URL='', fundingid='') for contributor in _get_contributor(self.dc)] +
              [dict(role='funder', name=first(project_name) or '', id=first(project_name) or '', organisation=first(project_funder) or "", URL=first(project_homepage) or '', fundingid=first(project_funding) or '',)] +
              [dict(role='owner', name=first([a.get('resource') for a in self.dc('rightsHolder', recursive=False)]) or first(_get_rightsholder(self.dc)) or '', id='', organisation='', URL='', fundingid='')],
        tag_string=','.join(tags) or '',
        # Todo! Implement if possible
        temporal_coverage_begin='',
        temporal_coverage_end='',
        type='dataset',
        uploader=uploader,
        # Used in smear harvest code to extract variable, station and year values, but is not used when
        # creating the dataset via API.
        smear_url=first(_get_download(self.dc, False)) or '',
        # Todo! This should be more exactly picked
        version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
        # version=dc(
        #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
        #     partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
    )
    if not unified['language']:
        unified['langdis'] = 'True'

    # Create id and name
    unified['id'] = generate_pid()
    unified['name'] = pid_to_name(unified['id'])

    # If primary pid is missing, set package id as primary pid
    if not any(pid.get('type', None) == u'primary' for pid in unified['pids']):
        unified['pids'].append(dict(id=unified['id'], type=u'primary', provider=None))

    # if not unified['project_name']:
    #     unified['projdis'] = 'True'
    return unified
def populate_harvest_job(self, harvest_job, set_ids, config, client):
    """Fill a harvest job with HarvestObjects to be fetched.

    :param harvest_job: HarvestJob instance being gathered
    :param set_ids: OAI-PMH set identifiers to restrict the listing
    :param config: harvest source configuration dict (may contain 'limit')
    :param client: client object passed to ``get_package_ids``
    :return: list of created HarvestObject ids, or None if no packages
             were received
    """
    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source == harvest_job.source) \
        .filter(HarvestJob.gather_finished != None) \
        .filter(HarvestJob.id != harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    # Harvest incrementally (from the previous gather start) only when the
    # source package has not been modified since that gather started.
    last_time = None
    if previous_job and previous_job.finished and model.Package.get(
            harvest_job.source.id
    ).metadata_modified < previous_job.gather_started:
        last_time = previous_job.gather_started.isoformat()

    # Collect package ids
    package_ids = list(
        self.get_package_ids(set_ids, config, last_time, client))
    log.debug('Identifiers: %s', package_ids)

    if not self._recreate(harvest_job) and package_ids:
        # Map CKAN package names back to harvested identifiers so that
        # identifiers which already exist as packages can be removed.
        converted_identifiers = {}
        for identifier in package_ids:
            converted_identifiers[pid_to_name(identifier)] = identifier
            # Identifiers ending in 'm' also map from their 's'-suffixed
            # name variant.
            if identifier.endswith(u'm'):
                converted_identifiers[pid_to_name(
                    u"%ss" % identifier[0:-1])] = identifier

        for package in model.Session.query(model.Package).filter(
                model.Package.name.in_(
                    converted_identifiers.keys())).all():
            converted_name = package.name
            if converted_identifiers[converted_name] not in package_ids:
                # Name matched via the 's' variant; recover the 'm' form.
                converted_name = "%sm" % converted_name[0:-1]
            package_ids.remove(converted_identifiers[converted_name])

    if previous_job:
        # Re-queue every identifier that errored in the previous job.
        for previous_error in [
                error.guid
                for error in Session.query(HarvestObject).filter(
                    HarvestObject.harvest_job_id == previous_job.id).
                filter(HarvestObject.state == 'ERROR').all()
        ]:
            if previous_error not in package_ids:
                package_ids.append(previous_error)

    try:
        object_ids = []
        if len(package_ids):
            # Honour an optional 'limit' from the source configuration.
            for package_id in islice(
                    package_ids,
                    config['limit']) if 'limit' in config else package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)

            log.debug('Object ids: {i}'.format(i=object_ids))
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: {u}'.format(
                    u=harvest_job.source.url), harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
        raise
def read_data(self, xml):
    """ Extract package data from given XML.
    :param xml: xml element (lxml)
    :return: dictionary of the unified package fields
    :raises CmdiReaderException: if the CMD or resourceInfo element is missing
    """
    cmd = first(xml.xpath('//oai:record/oai:metadata/cmd:CMD', namespaces=self.namespaces))
    if cmd is None:
        raise CmdiReaderException("Unexpected XML format: No CMD -element found")

    resource_info = cmd.xpath("//cmd:Components/cmd:resourceInfo", namespaces=self.namespaces)[0]
    if resource_info is None:
        raise CmdiReaderException("Unexpected XML format: No resourceInfo -element found")

    metadata_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:identifier/text()")

    languages = self._text_xpath(cmd, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:languageInfo/cmd:languageId/text()")

    # convert the descriptions to a JSON string of type {"fin":"kuvaus", "eng","desc"}
    desc_json = {}
    for desc in xml.xpath("//cmd:identificationInfo/cmd:description", namespaces=self.namespaces):
        lang = convert_language(desc.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
        desc_json[lang] = unicode(desc.text).strip()

    description = json.dumps(desc_json)

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
    transl_json = {}
    for title in xml.xpath('//cmd:identificationInfo/cmd:resourceName', namespaces=self.namespaces):
        lang = convert_language(title.get('{http://www.w3.org/XML/1998/namespace}lang', 'undefined').strip())
        transl_json[lang] = title.text.strip()

    title = json.dumps(transl_json)
    provider = self.provider

    version = first(self._text_xpath(resource_info, "//cmd:metadataInfo/cmd:metadataLastDateUpdated/text()")) or ""
    coverage = first(self._text_xpath(resource_info, "//cmd:corpusInfo/cmd:corpusMediaType/cmd:corpusTextInfo/cmd:timeCoverageInfo/cmd:timeCoverage/text()")) or ""

    pids = []
    primary_pid = ''
    direct_download_URL = ''
    access_request_URL = ''
    access_application_URL = ''

    # data_identifiers = self._text_xpath(cmd, "//cmd:identificationInfo/cmd:url/text()")

    # Only the first URN-style identifier becomes the primary PID.
    for pid in [CmdiReader._language_bank_urn_pid_enhancement(metadata_pid) for metadata_pid in metadata_identifiers]:
        if 'urn' in pid and not primary_pid:
            pids.append(dict(id=pid, provider=provider, type='primary'))
            primary_pid=pid
    # else:
    #     pids.append(dict(id=pid, provider=provider, type='relation', relation='generalRelation'))
    #
    # pids += [dict(id=CmdiReader._language_bank_urn_pid_enhancement(pid), provider=provider, type='relation',
    #               relation='generalRelation') for pid in data_identifiers]

    license_identifier = CmdiReader._language_bank_license_enhancement(first(self._text_xpath(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licence/text()")) or 'notspecified')
    availability = CmdiReader._language_bank_availability_from_license(license_identifier)

    # The availability class derived from the license decides which of the
    # access URL fields receives the primary PID.
    if license_identifier.lower().strip() != 'undernegotiation':
        if availability == 'direct_download':
            direct_download_URL = primary_pid
        if availability == 'access_request':
            access_request_URL = primary_pid
        if availability == 'access_application_other':
            sliced_pid = primary_pid.rsplit('/', 1)
            if len(sliced_pid) >= 2:
                access_application_URL = 'https://lbr.csc.fi/web/guest/catalogue?domain=LBR&target=basket&resource=' + sliced_pid[1]

    # Coverage like "1880 - 1939" is split into begin/end years.
    temporal_coverage_begin = ""
    temporal_coverage_end = ""
    if coverage:
        split = [item.strip() for item in coverage.split("-")]
        if len(split) == 2:
            temporal_coverage_begin = split[0]
            temporal_coverage_end = split[1]

    # TODO: Check agent mapping.
    #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorPerson")
    #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson")
    #print "###", _get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson")
    #print "###", _get_persons(resource_info, "//cmd:contactPerson")
    #print "###", _get_persons(resource_info, "//cmd:metadataInfo/cmd:metadataCreator")
    #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:licensorOrganization")
    #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization")
    #print "###", _get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization")
    contacts = self._persons_as_contact(self._get_persons(resource_info, "//cmd:contactPerson"))

    agents = []
    agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:iprHolderPerson"), 'author'))
    agents.extend(self._persons_as_agent(self._get_persons(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderPerson"), 'owner'))
    agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:iprHolderOrganization"), 'author'))
    agents.extend(self._organization_as_agent(self._get_organizations(resource_info, "//cmd:distributionInfo/cmd:licenceInfo/cmd:distributionRightsHolderOrganization"), 'owner'))

    # Reuse the package id of an existing package with this primary PID,
    # otherwise mint a new one.
    existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
    package_id = existing_package_id if existing_package_id else get_unique_package_id()

    result = {'name': pid_to_name(package_id),
              'language': ",".join(languages),
              'pids': pids,
              'version': version,
              'notes': description,
              'title': title,
              'type': 'dataset',
              'contact': contacts,
              'agent': agents,
              'availability': availability,
              'direct_download_URL': direct_download_URL,
              'access_request_URL': access_request_URL,
              'access_application_URL': access_application_URL,
              'temporal_coverage_begin': temporal_coverage_begin,
              'temporal_coverage_end': temporal_coverage_end,
              'license_id': license_identifier,
              'license_URL': ''}

    if not languages:
        result['langdis'] = u'True'

    if package_id:
        result['id'] = package_id

    # TODO: Ask about distributionAccessMedium
    # _strip_first(_text_xpath(resource_info, "//cmd:distributionInfo/availability/text()"))
    # url = _strip_first(_text_xpath(resource_info, "//cmd:identificationInfo/cmd:url/text()"))

    return result
def _read(self):
    """ Read metadata from the parsed DC/OAI document (``self.dc``) and map it
    into the unified internal harvester format dict.

    :return: dictionary in the unified internal harvester format
    """
    project_funder, project_funding, project_name, project_homepage = _get_project_stuff(
        self.dc) or ('', '', '', '')

    # Todo! This needs to be improved to use also simple-dc
    # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
    availability, license_id, license_url, access_application_url = _get_rights(
        self.dc) or ('', '', '', '')
    if not availability:
        availability = first(self._get_availability())

    uploader = self._get_uploader()

    data_pids = list(_get_data_pids(self.dc))

    tags = []
    #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
    #    tags.extend(self._resolve_tags(tag))
    tags = [a.string for a in self.dc('subject', recursive=False)]

    # Collect per-language titles into a JSON object keyed by language code.
    transl_json = {}
    for title in self.dc('title', recursive=False):
        lang = utils.convert_language(title.get('xml:lang', '').strip())
        transl_json[lang] = title.string.strip()

    # NOTE: ``title`` is deliberately rebound from the loop variable above
    # to the aggregated JSON string used in the unified dict.
    title = json.dumps(transl_json)

    def _get_primary_pid(data_pids):
        # Pick the first IDA urn pid as primary. NOTE: mutates ``data_pids``
        # in place so the chosen pid is not repeated as a relation pid in the
        # ``pids`` list built below.
        for dpid in data_pids:
            if dpid.startswith('urn:nbn:fi:csc-ida'):
                data_pids.remove(dpid)
                return [dpid]
        return []

    # Create a unified internal harvester format dict
    unified = dict(
        # ?=dc('source', recursive=False),
        # ?=dc('relation', recursive=False),
        # ?=dc('type', recursive=False),
        access_application_URL=access_application_url or '',

        # Todo! Implement
        access_request_URL='',

        algorithm=first(_get_algorithm(self.dc)) or '',

        # TODO: Handle availabilities better
        availability=availability,

        checksum=_get_checksum(self.dc) or '',

        direct_download_URL=first(_get_download(self.dc)) or '',

        # Todo! Implement
        discipline='',

        # Todo! Should be possible to implement with QDC, but not with OAI_DC
        # evdescr=[],
        # evtype=[],
        # evwhen=[],
        # evwho=[],

        # Todo! Implement
        geographic_coverage='',

        #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
        title=title,

        language=','.join(
            sorted(
                [a.string for a in self.dc('language', recursive=False)])),

        license_URL=license_url or '',
        license_id=license_id or 'notspecified',

        # Todo! Using only the first entry, for now
        contact=[
            dict(name=name or "",
                 email=email or "",
                 URL=url or "",
                 phone=phone or "")
            for name, email, phone, url in self._get_maintainer_stuff()
        ],

        # Todo! IDA currently doesn't produce this, maybe in future
        # dc('hasFormat', recursive=False)
        mimetype=self._get_mime_type(),

        notes=self._read_notes(),

        # Todo! Using only the first entry, for now
        # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

        # NOTE: _get_primary_pid removes the primary pid from data_pids, so
        # the second comprehension only yields the remaining relation pids.
        pids=[
            dict(id=pid, provider=_get_provider(self.bs), type=u'primary')
            for pid in _get_primary_pid(data_pids)
        ] + [
            dict(id=pid,
                 provider=_get_provider(self.bs),
                 type=u'relation',
                 relation=u'generalRelation') for pid in data_pids
        ] + [
            dict(id=pid,
                 provider=_get_provider(self.bs),
                 type=u'relation',
                 relation=u'generalRelation')
            for pid in self._get_version_pids()
        ] + [
            dict(id=pid,
                 provider=_get_provider(self.bs),
                 type=u'relation',
                 relation=u'generalRelation')
            for pid in _get_metadata_pid(self.dc)
        ],

        agent=[
            dict(role='author',
                 name=orgauth.get('value', ''),
                 id='',
                 organisation=orgauth.get('org', ''),
                 URL='',
                 fundingid='') for orgauth in _get_org_auth(self.dc)
        ] + [
            dict(role='contributor',
                 name=contributor.get('value', ''),
                 id='',
                 organisation=contributor.get('org', ''),
                 URL='',
                 fundingid='') for contributor in _get_contributor(self.dc)
        ] + [
            dict(
                role='funder',
                name=first(project_name) or '',
                id=first(project_name) or '',
                organisation=first(project_funder) or "",
                URL=first(project_homepage) or '',
                fundingid=first(project_funding) or '',
            )
        ] + [
            dict(role='owner',
                 name=first([
                     a.get('resource')
                     for a in self.dc('rightsHolder', recursive=False)
                 ]) or first(_get_rightsholder(self.dc)) or '',
                 id='',
                 organisation='',
                 URL='',
                 fundingid='')
        ],

        tag_string=','.join(tags) or '',

        # Todo! Implement if possible
        temporal_coverage_begin='',
        temporal_coverage_end='',

        type='dataset',
        uploader=uploader,

        # Used in smear harvest code to extract variable, station and year values, but is not used when
        # creating the dataset via API.
        smear_url=first(_get_download(self.dc, False)) or '',

        # Todo! This should be more exactly picked
        version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
        # version=dc(
        #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
        #     partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
    )

    if not unified['language']:
        unified['langdis'] = 'True'

    # Create id and name
    unified['id'] = generate_pid()
    unified['name'] = pid_to_name(unified['id'])

    # If primary pid is missing, set package id as primary pid
    if not any(
            pid.get('type', None) == u'primary'
            for pid in unified['pids']):
        unified['pids'].append(
            dict(id=unified['id'], type=u'primary', provider=None))

    # if not unified['project_name']:
    #     unified['projdis'] = 'True'
    return unified