def _fetch_import_set(self, harvest_object, master_data, client, group):
    '''
    List the record identifiers of one set (or reuse the ones left over from
    an earlier attempt) and add the matching packages to the set's subgroup.

    When ``master_data`` contains a 'set' key, the identifiers are listed
    through ``client.listIdentifiers``. Without that key the call is a
    retry and the identifiers already stored in
    ``master_data['record_ids']`` are used as they are.

    Identifiers for which no package exists are written back to
    ``harvest_object.content`` as JSON, with the 'set' key removed, so that
    a later call skips the listing and only retries the insertion. When
    nothing was missed the content is set to None.

    :param harvest_object: harvest object whose ``content`` is rewritten and
        against which fetch errors are recorded
    :param master_data: dict describing the set; modified in place
    :param client: client object providing ``listIdentifiers``
    :param group: parent group; its name prefixes the subgroup name
    :returns: False if the set has no records or listing failed,
        True otherwise
    '''
    # Absence of 'set' marks a retry of earlier, unsuccessful insertions.
    retrying = 'set' not in master_data

    if retrying:
        log.debug('Reinsert: %s %i' % (master_data['set_name'],
                                       len(master_data['record_ids']),))
    else:
        # Fetch stage.
        query = {self.metadata_prefix_key: self.metadata_prefix_value,
                 'set': master_data['set']}
        for bound in ('from_', 'until'):
            if bound in master_data:
                query[bound] = self._datetime_from_str(master_data[bound])
        try:
            found = [header.identifier()
                     for header in client.listIdentifiers(**query)]
        except NoRecordsMatchError:
            # Ok, empty set. Nothing to do.
            return False
        except socket.error:
            exc_type, exc_value = sys.exc_info()[:2]
            self._save_object_error(
                'Socket error OAI-PMH %s, details:\n%s'
                % (exc_type, exc_value,),
                harvest_object, stage='Fetch')
            return False
        except httplib.BadStatusLine:
            self._save_object_error(
                'Bad HTTP response status line.',
                harvest_object, stage='Fetch')
            return False
        # Kept in master_data only; nothing is saved to the DB at this point.
        master_data['record_ids'] = found

    # Import stage.
    model.repo.new_revision()
    subg_name = '%s - %s' % (group.name, master_data['set_name'],)
    subgroup = Group.by_name(subg_name)
    if not subgroup:
        subgroup = Group(name=subg_name, description=subg_name)
        setup_default_user_roles(subgroup)
        subgroup.save()

    unmatched = []
    for record_id in master_data['record_ids']:
        pkg_name = self._package_name_from_identifier(record_id)
        if not Package.get(pkg_name):
            # Either omitted due to missing metadata or a fetch error. In
            # the latter case the record should be added later, once the
            # fetch succeeds after a retry.
            unmatched.append(record_id)
            if retrying:
                log.debug('Omitted %s from %s' % (pkg_name, subg_name,))
            continue
        subgroup.add_package_by_name(pkg_name)
        subgroup.save()
        if retrying:
            log.debug('Inserted %s into %s' % (pkg_name, subg_name,))

    if unmatched:
        # Store the missing identifiers for a retry; dropping 'set' makes
        # the next call skip the fetch stage.
        master_data['record_ids'] = unmatched
        master_data.pop('set', None)
        harvest_object.content = json.dumps(master_data)
        log.debug('Missed %s %i' % (master_data['set_name'],
                                    len(unmatched),))
    else:
        # Clear data.
        harvest_object.content = None
    model.repo.commit()
    return True
def import_stage(self, harvest_object):
    '''
    Create or update one CKAN package per record stored in the harvest
    object and attach it to the domain group and the set's subgroup.

    ``harvest_object.content`` is read as a JSON object with the keys
    'domain', 'set_name' and 'records'. Each record is unpacked as an
    ``(identifier, metadata, _)`` triple; ``metadata`` is treated as a
    mapping from field name to a list of strings. Records whose metadata
    is empty are skipped.

    For every remaining record the method:

    - derives the package name from the title (or the identifier when
      there is no title): ASCII-folded, lower-cased, spaces replaced by
      underscores, cut to 35 characters, then reduced to ASCII letters
      and underscores;
    - turns 'subject' and 'type' values into tags and joins every other
      non-empty field into a package extra;
    - saves the package, stores its id on the harvest object and adds a
      resource pointing at the last http(s) identifier found;
    - adds the package to the domain group and to the
      "<domain> - <set_name>" subgroup.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if the content could
        not be parsed or holds no 'records' key (an object error is saved
        in both cases)
    '''
    model.repo.new_revision()
    try:
        master_data = json.loads(harvest_object.content)
    except (TypeError, ValueError) as err:
        # TypeError: content is None; ValueError: content is not valid JSON.
        self._save_object_error(
            'Could not parse harvest object content: %s' % (err,),
            harvest_object, stage='Import')
        return False

    domain = master_data['domain']
    group = Group.get(domain)
    if not group:
        group = Group(name=domain, description=domain)

    if 'records' not in master_data:
        self._save_object_error('Could not receive any objects from fetch!',
                                harvest_object, stage='Import')
        return False

    set_name = master_data['set_name']
    subg_name = "%s - %s" % (domain, set_name)
    slug_chars = string.ascii_letters + '_'

    for identifier, metadata, _ in master_data['records']:
        if not metadata:
            continue

        # Missing fields are treated like empty ones.
        titles = metadata.get('title') or []
        creators = metadata.get('creator') or []
        descriptions = metadata.get('description') or []

        title = titles[0] if titles else identifier
        resource_name = titles[0] if titles else ''
        creator = creators[0] if creators else ''
        description = descriptions[0] if descriptions else ''

        # Decode back to text after ASCII folding so the string methods
        # below behave the same on Python 2 and Python 3.
        ascii_title = unicodedata.normalize('NFKD', title)\
            .encode('ASCII', 'ignore').decode('ASCII')
        norm_title = ascii_title.lower().replace(' ', '_')[:35]
        # NOTE(review): digits and punctuation are dropped here, so a title
        # without ASCII letters yields an empty name and similar titles can
        # collide -- confirm this is acceptable.
        name = ''.join(ch for ch in norm_title if ch in slug_chars)

        pkg = Package.by_name(name)
        if not pkg:
            pkg = Package(name=name, title=title)

        extras = {}
        for key, value in metadata.items():
            if not value:
                continue
            if key in ('subject', 'type'):
                for tag in value:
                    if not tag:
                        continue
                    tag = munge_tag(tag[:100])
                    tag_obj = model.Tag.by_name(tag)
                    if not tag_obj:
                        tag_obj = model.Tag(name=tag)
                    # NOTE(review): a PackageTag is created on every import,
                    # also for packages that already existed -- confirm
                    # duplicates are handled elsewhere.
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    Session.add(tag_obj)
                    Session.add(pkgtag)
            else:
                extras[key] = ' '.join(value)

        pkg.author = creator
        pkg.author_email = creator
        pkg.title = title
        pkg.notes = description
        pkg.extras = extras
        pkg.url = \
            "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
            % (harvest_object.job.source.url, identifier)
        pkg.save()

        harvest_object.package_id = pkg.id
        Session.add(harvest_object)
        setup_default_user_roles(pkg)

        # The last identifier that looks like a web link becomes the
        # resource URL; https links count as well as plain http ones.
        url = ''
        for candidate in metadata.get('identifier') or []:
            if candidate.startswith(('http://', 'https://')):
                url = candidate
        pkg.add_resource(url, description=description, name=resource_name)

        group.add_package_by_name(pkg.name)
        subgroup = Group.by_name(subg_name)
        if not subgroup:
            subgroup = Group(name=subg_name, description=subg_name)
        subgroup.add_package_by_name(pkg.name)
        Session.add(group)
        Session.add(subgroup)
        setup_default_user_roles(group)
        setup_default_user_roles(subgroup)

    model.repo.commit()
    return True