コード例 #1
0
 def _fetch_import_set(self, harvest_object, master_data, client, group):
     """Attach the packages of one set to its subgroup, fetching ids if needed.

     When ``master_data`` holds a ``'set'`` key, the record identifiers are
     listed through ``client`` (optionally bounded by ``'from_'`` and
     ``'until'``). Otherwise the ``'record_ids'`` already stored in
     ``master_data`` are reused. Identifiers for which no package exists
     are written back to ``harvest_object.content`` as JSON so that a later
     call can retry them.

     :param harvest_object: object whose ``content`` receives the retry
         data, or ``None`` when no identifier was missed
     :param master_data: dict with ``'set_name'`` and either ``'set'`` or
         ``'record_ids'``; it is modified in place
     :param client: object providing ``listIdentifiers(**kwargs)``
     :param group: parent group; its ``name`` prefixes the subgroup name
     :returns: False when listing identifiers failed or matched no
         records, True once the revision has been committed
     """
     if 'set' not in master_data:
         # Retry of set insertions: identifiers come from an earlier call.
         log.debug('Reinsert: %s %i' % (
             master_data['set_name'], len(master_data['record_ids'])))
     else:
         # Fetch stage.
         list_args = {self.metadata_prefix_key: self.metadata_prefix_value}
         list_args['set'] = master_data['set']
         for bound in ('from_', 'until'):
             if bound in master_data:
                 list_args[bound] = self._datetime_from_str(
                     master_data[bound])
         try:
             fetched = [item.identifier()
                        for item in client.listIdentifiers(**list_args)]
         except NoRecordsMatchError:
             # Empty set, nothing to do.
             return False
         except socket.error:
             # The first two items of exc_info() are the exception class
             # and the exception instance.
             exc_type, exc_value = sys.exc_info()[:2]
             self._save_object_error(
                 'Socket error OAI-PMH %s, details:\n%s' % (
                     exc_type, exc_value),
                 harvest_object, stage='Fetch')
             return False
         except httplib.BadStatusLine:
             self._save_object_error(
                 'Bad HTTP response status line.',
                 harvest_object, stage='Fetch')
             return False
         master_data['record_ids'] = fetched

     # The fetched identifiers are not saved to the DB at this point.
     # Import stage.
     model.repo.new_revision()
     subg_name = '%s - %s' % (group.name, master_data['set_name'])
     subgroup = Group.by_name(subg_name)
     if not subgroup:
         subgroup = Group(name=subg_name, description=subg_name)
         setup_default_user_roles(subgroup)
         subgroup.save()

     # Per-package debug lines are only written on retry runs.
     is_retry = 'set' not in master_data
     missed = []
     for ident in master_data['record_ids']:
         pkg_name = self._package_name_from_identifier(ident)
         if not Package.get(pkg_name):
             # Either omitted due to missing metadata or a fetch error.
             # Keep the identifier so it can be added on a later retry.
             missed.append(ident)
             if is_retry:
                 log.debug('Omitted %s from %s' % (pkg_name, subg_name))
             continue
         subgroup.add_package_by_name(pkg_name)
         subgroup.save()
         if is_retry:
             log.debug('Inserted %s into %s' % (pkg_name, subg_name))

     if missed:
         # Store the missing identifiers for retry.
         master_data['record_ids'] = missed
         if not is_retry:
             # Dropping 'set' makes the next call skip the fetch stage.
             del master_data['set']
         harvest_object.content = json.dumps(master_data)
         log.debug('Missed %s %i' % (master_data['set_name'], len(missed)))
     else:
         # Nothing left to retry: clear the stored data.
         harvest_object.content = None
     model.repo.commit()
     return True
コード例 #2
0
ファイル: harvester.py プロジェクト: ilrt/ckanext-oaipmh
    def import_stage(self, harvest_object):
        '''
        Turn the records stored in a HarvestObject into packages.

        ``harvest_object.content`` is parsed as JSON and must contain a
        ``'domain'`` key. When it also contains ``'records'`` (together
        with ``'set_name'``), every record that carries metadata is saved
        as a package named after a slug of its first title, tagged from its
        ``subject`` and ``type`` values, given one resource, and added to
        both the domain group and the "<domain> - <set_name>" subgroup.

        :param harvest_object: HarvestObject object
        :returns: True once the records have been processed and committed,
                  False (after saving an Import-stage error) when the
                  content has no ``'records'`` key
        '''
        model.repo.new_revision()
        master_data = json.loads(harvest_object.content)
        domain = master_data['domain']
        group = Group.get(domain) or Group(name=domain, description=domain)

        if 'records' not in master_data:
            self._save_object_error(
                'Could not receive any objects from fetch!',
                harvest_object, stage='Import')
            return False

        records = master_data['records']
        set_name = master_data['set_name']
        for identifier, metadata, _ in records:
            # Records without metadata are skipped.
            if not metadata:
                continue

            # Package title: first title, or the identifier if there is none.
            titles = metadata['title']
            title = titles[0] if len(titles) else identifier

            # Package name: ASCII-folded, lower-cased, underscored title cut
            # to 35 characters, then reduced to letters and underscores.
            ascii_title = unicodedata.normalize('NFKD', title)\
                                     .encode('ASCII', 'ignore')
            trimmed = ascii_title.lower().replace(' ', '_')[:35]
            name = ''.join(ch for ch in trimmed
                           if ch in string.ascii_letters + '_')

            creators = metadata['creator']
            creator = creators[0] if len(creators) else ''
            descriptions = metadata['description']
            description = descriptions[0] if len(descriptions) else ''

            pkg = Package.by_name(name) or Package(name=name, title=title)

            # 'subject' and 'type' values become tags; every other non-empty
            # field is stored as a space-joined extra.
            extras = {}
            for key, values in metadata.items():
                if len(values) == 0:
                    continue
                if key not in ('subject', 'type'):
                    extras[key] = ' '.join(values)
                    continue
                for raw_tag in values:
                    if not raw_tag:
                        continue
                    tag_name = munge_tag(raw_tag[:100])
                    tag_obj = (model.Tag.by_name(tag_name)
                               or model.Tag(name=tag_name))
                    if tag_obj:
                        pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                        Session.add(tag_obj)
                        Session.add(pkgtag)

            pkg.author = creator
            # NOTE(review): author_email receives the creator string too;
            # confirm this is intended.
            pkg.author_email = creator
            pkg.title = title
            pkg.notes = description
            pkg.extras = extras
            pkg.url = (
                "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"
                % (harvest_object.job.source.url, identifier))
            pkg.save()

            # Link the harvest object to the saved package.
            harvest_object.package_id = pkg.id
            Session.add(harvest_object)
            setup_default_user_roles(pkg)

            # Resource URL: the last identifier starting with 'http://', or
            # '' if there is none.
            # NOTE(review): 'https://' identifiers are not picked up; confirm.
            http_ids = [candidate for candidate in metadata['identifier']
                        if candidate.startswith('http://')]
            resource_url = http_ids[-1] if http_ids else ''
            # Unlike the package title, the resource name falls back to ''.
            resource_name = titles[0] if len(titles) else ''
            pkg.add_resource(resource_url, description=description,
                             name=resource_name)

            group.add_package_by_name(pkg.name)
            subg_name = "%s - %s" % (domain, set_name)
            subgroup = (Group.by_name(subg_name)
                        or Group(name=subg_name, description=subg_name))
            subgroup.add_package_by_name(pkg.name)
            Session.add(group)
            Session.add(subgroup)
            setup_default_user_roles(group)
            setup_default_user_roles(subgroup)

        model.repo.commit()
        return True