import re


def sanitize_keyword(s, strict=True):
    """
    Make a string usable as a CKAN keyword.

    The keyword rules are not very clear: at first CKAN seemed to accept
    nothing beyond lowercase characters and -_, but it now appears to handle
    spaces, uppercase characters and accented characters as well.

    This is an alternative to setting {"clean_tags": true} in the harvesting
    configuration, which is a little more destructive (although probably
    quite similar to the strict option).

    :param s: the string to sanitize
    :param strict: if True, reduce to lowercase ASCII letters, digits, _ and -
    :return: the sanitized keyword
    """

    if not s:
        return ''

    s = re.sub(r'\s+', ' ', s)  # collapse repeated whitespace into single spaces
    s = s.strip()  # remove leading/trailing spaces
    s = re.sub(u'\'', ' ', s)  # change single quotes to spaces
    #s = re.sub(r'[\s]', '_', s)
    if strict:
        # should ensure compliance with CKAN validators' requirements (as announced)
        #s = unidecode.unidecode(s)  # remove accents and keep to closest possible ascii match
        s = substitute_ascii_equivalents(
            s)  # remove accents and keep the closest possible ascii match
        pattern = u'[^\w\-]'  # a stricter match pattern
        s = re.sub(pattern, '-', s, flags=re.UNICODE).lower()  # all lowercased
    else:
        # seems sufficient in most cases
        pattern = u'[^a-zA-Z0-9_àâäôéèëêïîçùûüÿæœÀÂÄÔÉÈËÊÏÎÇÙÛÜÆŒ \-]'  # accept accented characters
        s = re.sub(pattern, '-', s,
                   flags=re.UNICODE)  # don't lowercase systematically
    return s
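
For illustration, the two modes would behave roughly as follows on an accented keyword (assuming substitute_ascii_equivalents maps accented characters to their closest ASCII equivalents):

print(sanitize_keyword(u"Qualité de l'air", strict=True))   # -> qualite-de-l-air
print(sanitize_keyword(u"Qualité de l'air", strict=False))  # -> Qualité de l air
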
Example 2
    def migrate(self):
        '''
        Migrate Related Items to Showcases, skipping any that already have one.
        '''
        related_items = get_action('related_list')(data_dict={})

        # preflight:
        # related items must have unique titles before migration
        related_titles = [i['title'] for i in related_items]
        # make a list of duplicate titles
        duplicate_titles = self._find_duplicates(related_titles)
        if duplicate_titles:
            print(
                """All Related Items must have unique titles before migration. The following
Related Item titles are used more than once and need to be corrected before
migration can continue. Please correct and try again:"""
            )
            for i in duplicate_titles:
                print(i)
            return

        for related in related_items:
            existing_showcase = get_action('package_search')(
                data_dict={'fq': '+dataset_type:showcase original_related_item_id:{0}'.format(related['id'])})
            normalized_title = substitute_ascii_equivalents(related['title'])
            if existing_showcase['count'] > 0:
                print('Showcase for Related Item "{0}" already exists.'.format(
                    normalized_title))
            else:
                data_dict = {
                    'original_related_item_id': related.get('id'),
                    'title': related.get('title'),
                    'name': munge_title_to_name(related.get('title')),
                    'notes': related.get('description'),
                    'image_url': related.get('image_url'),
                    'url': related.get('url'),
                    'tags': [{"name": related.get('type').lower()}]
                }
                # make the showcase
                try:
                    new_showcase = get_action('ckanext_showcase_create')(
                        data_dict=data_dict)
                except Exception as e:
                    print('There was a problem migrating "{0}": {1}'.format(
                        normalized_title, e))
                else:
                    print('Created Showcase from the Related Item "{0}"'.format(normalized_title))

                    # make the showcase_package_association, if needed
                    try:
                        related_pkg_id = self._get_related_dataset(
                            related['id'])
                        if related_pkg_id:
                            get_action('ckanext_showcase_package_association_create')(
                                data_dict={'showcase_id': new_showcase['id'],
                                           'package_id': related_pkg_id})
                    except Exception as e:
                        print('There was a problem creating the showcase_package_association for "{0}": {1}'.format(
                            normalized_title, e))
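
The preflight check relies on a _find_duplicates helper that is not shown here. A minimal sketch of what such a helper might look like (a hypothetical implementation, not necessarily the extension's actual code):

from collections import Counter

def _find_duplicates(self, items):
    # Return each value that occurs more than once in items.
    return [value for value, count in Counter(items).items() if count > 1]
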
Example 3
def munge_tag(tag):
    tag = substitute_ascii_equivalents(tag)
    tag = tag.lower().strip()
    tag = re.sub(r'[^a-zA-Z0-9\- ]', '', tag).replace(' ', '-')
    tag = _munge_to_length(tag, model.MIN_TAG_LENGTH, model.MAX_TAG_LENGTH)
    return tag
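
_munge_to_length clamps the tag to CKAN's minimum and maximum tag lengths. In CKAN's ckan.lib.munge this is done by padding strings that are too short and truncating strings that are too long; roughly (a sketch, not a verbatim copy):

def _munge_to_length(string, min_length, max_length):
    # Pad strings below min_length with underscores; cut strings above max_length.
    if len(string) < min_length:
        string += '_' * (min_length - len(string))
    if len(string) > max_length:
        string = string[:max_length]
    return string
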
Example 4
def munge_tag(tag):
    tag = substitute_ascii_equivalents(tag)
    tag = tag.lower().strip()
    return re.sub(r'[^a-zA-Z0-9 -]', '', tag).replace(' ', '-')
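
For example, assuming substitute_ascii_equivalents transliterates accented characters to plain ASCII, this variant would behave like:

print(munge_tag(u'Città Metropolitana'))  # -> citta-metropolitana
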
Example 5
        except ContentFetchError as e:
            self._save_gather_error('%r' % e.message, harvest_job)
            return False
        except KeyError as e:
            self._save_gather_error('Failed to parse response: %r' % e, harvest_job)
            return False
        #members = self.get_xroad_catalog("http://localhost:9090/rest-gateway-0.0.8-SNAPSHOT/Consumer/catalog", "2011-01-01")
        #file = open(os.path.join(os.path.dirname(__file__), '../tests/response.json'))
        #members = json.load(file)

        object_ids = []
        for member in members:
            log.info(json.dumps(member))
            #log.info(type(member['subsystems']['subsystem']))
            # Create organization id
            org_id = substitute_ascii_equivalents(
                unicode(member.get('xRoadInstance', '')) + '.' +
                unicode(member.get('memberClass', '')) + '.' +
                unicode(member.get('memberCode', '')))

            if member['subsystems'] and (type(member['subsystems']['subsystem']) is list):

                org = self._create_or_update_organization(
                    {'id': org_id,
                     'name': member['name'],
                     'created': member['created'],
                     'changed': member['changed'],
                     'removed': member.get('removed', None)},
                    harvest_job)
                for subsystem in member['subsystems']['subsystem']:

                    # Generate GUID
                    guid = substitute_ascii_equivalents(
                        unicode(member.get('xRoadInstance', '')) + '.' +
                        unicode(member.get('memberClass', '')) + '.' +
                        unicode(member.get('memberCode', '')) + '.' +
                        unicode(subsystem.get('subsystemCode', '')))

                    # Create harvest object
                    obj = HarvestObject(guid=guid, job=harvest_job,
                                        content=json.dumps({
                                            'owner': org,
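
The identifiers built above are plain dot-joined strings. With a hypothetical member such as the following (field values are illustrative only):

member = {'xRoadInstance': 'FI', 'memberClass': 'GOV', 'memberCode': '123'}
# org_id -> 'FI.GOV.123'
# with subsystemCode 'ENV', guid -> 'FI.GOV.123.ENV'
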
Example 6
        object_ids = []
        for member in members:
            if isinstance(member, basestring):
                continue

            # if there is only 1 subsystem, wrap it with list
            if member['subsystems'] and (type(
                    member['subsystems']['subsystem']) is dict):
                member['subsystems']['subsystem'] = [
                    member['subsystems']['subsystem']
                ]

            # Create organization id
            org_id = substitute_ascii_equivalents(u'.'.join(
                unicode(member.get(p, ''))
                for p in ('xRoadInstance', 'memberClass', 'memberCode')))

            org = self._create_or_update_organization(
                {
                    'id': org_id,
                    'name': member['name'],
                    'created': member['created'],
                    'changed': member['changed'],
                    'removed': member.get('removed', None)
                }, harvest_job)

            if org is None:
                self._save_gather_error(
                    'Failed to create organization with id: %s and name: %s' %
                    (org_id, member['name']), harvest_job)
Example 7
 def read_data(self, id, resource_id):
     res = Resource.get(resource_id)
     pkg = Package.get(id)
     c.pkg_dict = pkg.as_dict()
     c.package = pkg
     c.resource = get_action('resource_show')({'model': model},
                                              {'id': resource_id})
     label = res.url.split(config.get('ckan.site_url') + '/storage/f/')[-1]
     label = urllib2.unquote(label)
     ofs = get_ofs()
     try:
         furl = ofs.get_url(BUCKET, label).split('file://')[-1]
     except FileNotFoundException:
         h.flash_error(_('Cannot do data mining on remote resource!'))
         url = h.url_for(controller='package', action='resource_read',
                         id=id, resource_id=resource_id)
         return redirect(url)
     wordstats = {}
     ret = {}
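     # Plain-text resources are mined directly below; office formats are
     # converted to text first, and anything else is rejected.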
     if res.format in ('TXT', 'txt'):
         wdsf, wdspath = tempfile.mkstemp()
         os.write(wdsf, "%s\nmetadata description title information" % furl)
         with os.fdopen(wdsf, 'r') as wordfile:
             preproc = orngText.Preprocess()
             table = orngText.loadFromListWithCategories(wdspath)
             data = orngText.bagOfWords(table, preprocessor=preproc)
             words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
         for i in range(len(words)):
             d = words[i]
             wordstats = d.get_metas(str)
         for k, v in wordstats.items():
             if v.value > 10.0:
                 ret[unicode(k, 'utf8')] = v.value
         from operator import itemgetter
         c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
         os.remove(wdspath)
         for i in range(len(words)):
             d = words[i]
             wordstats = d.get_metas(str)
         words = []
         for k, v in wordstats.items():
             words.append(k)
         model.repo.new_revision()
         if 'autoextracted_description' not in pkg.extras:
             pkg.extras['autoextracted_description'] = ' '.join(words)
         pkg.save()
         return render('datamining/read.html')
     elif res.format in ('odt', 'doc', 'xls', 'ods', 'odp', 'ppt', 'html'):
         textfd, textpath = convert_to_text(res, furl)
         if not textpath:
             h.flash_error(_('This file could not be mined for any data!'))
             os.close(textfd)
             return render('datamining/read.html')
         else:
             wdsf, wdspath = tempfile.mkstemp()
             os.write(wdsf, "%s\nmetadata description title information" % textpath)
             preproc = orngText.Preprocess()
             table = orngText.loadFromListWithCategories(wdspath)
             data = orngText.bagOfWords(table, preprocessor=preproc)
             words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
             for i in range(len(words)):
                 d = words[i]
                 wordstats = d.get_metas(str)
             for k, v in wordstats.items():
                 if v.value > 10.0:
                     ret[unicode(k, 'utf8')] = v.value
             from operator import itemgetter
             c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
             os.close(textfd)
             os.close(wdsf)
             os.remove(wdspath)
             os.remove(textpath)
             for i in range(len(words)):
                 d = words[i]
                 wordstats = d.get_metas(str)
             words = []
             for k, v in wordstats.items():
                 log.debug(k)
                 words.append(substitute_ascii_equivalents(k))
             model.repo.new_revision()
             if 'autoextracted_description' not in pkg.extras:
                 pkg.extras['autoextracted_description'] = ' '.join(words)
             pkg.save()
             return render('datamining/read.html')
     else:
         h.flash_error(_('This metadata document is not in proper format for data mining!'))
         url = h.url_for(controller='package', action='resource_read',
                         id=id, resource_id=resource_id)
         return redirect(url)
Example 8

        # Member = organization
        # Subsystem = package = API
        # Service = resource = WSDL

        object_ids = []
        for member in members:
            if isinstance(member, basestring):
                continue

            # if there is only 1 subsystem, wrap it with list
            if member['subsystems'] and (type(member['subsystems']['subsystem']) is dict):
                member['subsystems']['subsystem'] = [member['subsystems']['subsystem']]

            # Create organization id
            org_id = substitute_ascii_equivalents(u'.'.join(unicode(member.get(p, ''))
                for p in ('xRoadInstance', 'memberClass', 'memberCode')))

            org = self._create_or_update_organization({
                'id': org_id,
                'name': member['name'],
                'created': member['created'],
                'changed': member['changed'],
                'removed': member.get('removed', None)
                }, harvest_job)

            if org is None:
                continue

            if self._organization_has_wsdls(member):
                for subsystem in member['subsystems']['subsystem']: