Exemple #1
0
def dataproxy_resource_update(context, data_dict=None):
    """
    Intercepts default resource_update action and encrypts password for dataproxy type resources.

    After the optional password handling, the resource url is always
    rewritten to this site's ``datastore_search`` API endpoint with
    ``downloaded=true``.

    Args:
        context: Request context.
        data_dict: Parsed request parameters. ``None`` is treated as an
            empty dict. Must contain 'id'; a dataproxy update carrying a
            new password must also contain 'url'.
    Returns:
        see get_action('resource_update').
    Raises:
        Exception: if ckan.dataproxy.secret configuration not set.
        KeyError: if 'id' (or 'url' when a new password is supplied) is
            missing from data_dict.
    """
    # The signature allows data_dict to be omitted; normalise it so the
    # lookups below do not fail on None.
    if data_dict is None:
        data_dict = {}

    #If not set, default to empty string
    url_type = data_dict.setdefault('url_type', '')

    if url_type == 'dataproxy':
        secret = config.get('ckan.dataproxy.secret', False)
        if not secret:
            raise Exception('ckan.dataproxy.secret must be defined to encrypt passwords')
        password = data_dict.get('db_password', '')
        if password == '':
            #we don't want to overwrite existing password with empty string
            resource = Resource.get(data_dict['id'])
            # An unknown resource, or one that never stored a password,
            # has nothing to carry over; leave data_dict untouched and let
            # the wrapped action deal with it.
            if resource is not None and 'db_password' in resource.extras:
                data_dict['db_password'] = resource.extras['db_password']
        else:
            #replace password with a _password_ placeholder
            data_dict['url'] = data_dict['url'].replace(password, '_password_')
            #encrypt db_password
            data_dict['db_password'] = hexlify(encrypt(secret, password))

    # NOTE(review): this rewrite runs for every url_type and discards the
    # placeholder substitution made above - confirm that is intended.
    site_url = config.get('ckan.site_url', '127.0.0.1')
    data_dict['url'] = '{0}/api/3/action/datastore_search?resource_id={1}&downloaded=true'.format(site_url, data_dict['id'])

    return orig_resource_update(context, data_dict)
def datastore_search(original_action, context, data_dict):
    '''
    Run a datastore search, serving dataproxy resources through the
    dataproxy search controller.

    For a resource whose url_type is 'dataproxy' the request is turned into
    an unsorted 10-row preview and the 'result' part of the controller's
    JSON answer is returned, with a synthetic ``_id`` column added. Any
    other resource is passed to ``original_action`` unchanged.
    '''
    log.debug("datastore_search: context: {0} data_dict: {1}".format(context, data_dict))
    logic.check_access('datastore_search', context)

    resource = Resource.get(data_dict['resource_id'])
    handled_by_dataproxy = resource is not None and resource.url_type == 'dataproxy'
    if not handled_by_dataproxy:
        #Default action otherwise
        return original_action(context, data_dict)

    # Discard sorting and only return 10 rows cuz it's a data preview
    data_dict['sort'] = None
    data_dict['limit'] = 10

    # execute search
    raw_response = SearchController().dataproxy_search(data_dict, resource)
    results = json.loads(raw_response)
    log.debug("datastore_search:  results: {0}".format(results))

    payload = results['result']

    # set total to limit
    payload['total'] = data_dict['limit']

    # Hack in an _id field because datatables expects it. Rows are
    # numbered starting from 1.
    payload['fields'] = [{"id": "_id", "type": "BIGINT"}] + payload['fields']
    for row_number, record in enumerate(payload['records'], 1):
        record['_id'] = row_number
    return payload
Exemple #3
0
def dataproxy_resource_update(context, data_dict=None):
    """
    Intercepts default resource_update action and encrypts password for dataproxy type resources.

    Args:
        context: Request context.
        data_dict: Parsed request parameters. ``None`` is treated as an
            empty dict.
    Returns:
        see get_action('resource_update').
    Raises:
        Exception: if ckan.dataproxy.secret configuration not set.
        KeyError: if a dataproxy update lacks 'id' (no new password given)
            or 'url' (new password given).
    """
    # The signature allows data_dict to be omitted; normalise it so the
    # lookups below do not fail on None.
    if data_dict is None:
        data_dict = {}

    #If not set, default to empty string
    url_type = data_dict.setdefault('url_type', '')

    if url_type != 'dataproxy':
        return orig_resource_update(context, data_dict)

    secret = config.get('ckan.dataproxy.secret', False)
    if not secret:
        raise Exception(
            'ckan.dataproxy.secret must be defined to encrypt passwords')

    password = data_dict.get('db_password', '')
    if password == '':
        #we don't want to overwrite existing password with empty string
        resource = Resource.get(data_dict['id'])
        # Nothing to carry over when the resource is unknown or never had
        # a stored password; the wrapped action handles the rest.
        if resource is not None and 'db_password' in resource.extras:
            data_dict['db_password'] = resource.extras['db_password']
    else:
        #replace password with a _password_ placeholder
        data_dict['url'] = data_dict['url'].replace(password, '_password_')
        #encrypt db_password
        data_dict['db_password'] = hexlify(encrypt(secret, password))

    return orig_resource_update(context, data_dict)
Exemple #4
0
def import_collection_to_package(params, id):
    """
    Import a collection to dataset. Does not import whole file data but
    rather the metadata.

    Every object of the iRODS collection at ``params['path']`` that can be
    opened becomes (or updates) a resource in the package's first resource
    group, carrying the object's user metadata as extras. The collection's
    own user metadata is copied into the package extras. The user is then
    redirected to the package read page.
    """
    from irods import irodsCollection, iRodsOpen
    path = params['path']
    pkg = Package.get(id)
    conn = get_connection_from_params(params)
    if (conn):
        # Always release the iRODS connection, even if the import fails
        # part-way through.
        try:
            coll = irodsCollection(conn, path)
            rev = model.repo.new_revision()
            i = 0
            for obj in coll.getObjects():
                extras = {}
                fname, _ = obj
                fpath = "%s/%s" % (coll.getCollName(), fname)
                f = iRodsOpen(conn, fpath, 'r')
                if f:
                    i += 1
                    res = Resource.by_name(fname)
                    if not res:
                        res = Resource(url='', name=fname, extras=extras,
                                       resource_type='file')
                    for met in f.getUserMetadata():
                        key, value, _ = met
                        extras[key] = value
                    res.extras = extras
                    resgrp = pkg.resource_groups[0]
                    resgrp.resources.append(res)
                    Session.add(res)
                    Session.add(resgrp)
                    # The revision keeps the message of the last matched file.
                    rev.message = "Update from iRODS, matched file %s" % fname
            for met in coll.getUserMetadata():
                key, value, _ = met
                pkg.extras[key] = value
            Session.add(pkg)
            model.repo.commit()
        finally:
            conn.disconnect()
        h.flash_success("iRODS import to dataset OK! Imported %s resources." %
                        i)
    else:
        h.flash_error("Could not connect to iRODS!")
    h.redirect_to(controller='package', action='read', id=id)
def import_collection_to_package(params, id):
    """
    Import a collection to dataset. Does not import whole file data but
    rather the metadata.

    Walks the iRODS collection named by ``params['path']``; each object
    that can be opened is attached to the first resource group of package
    ``id`` with its user metadata as extras, and the collection's user
    metadata is written to the package extras. Finishes by redirecting to
    the package read page.
    """
    from irods import irodsCollection
    from irods import iRodsOpen
    path = params['path']
    pkg = Package.get(id)
    conn = get_connection_from_params(params)
    if not conn:
        h.flash_error("Could not connect to iRODS!")
    else:
        # try/finally so a failure during the import cannot leak the
        # iRODS connection.
        try:
            coll = irodsCollection(conn, path)
            rev = model.repo.new_revision()
            i = 0
            for fname, _ in coll.getObjects():
                extras = {}
                fpath = "%s/%s" % (coll.getCollName(), fname)
                f = iRodsOpen(conn, fpath, 'r')
                if not f:
                    continue
                i += 1
                res = Resource.by_name(fname)
                if not res:
                    res = Resource(url='', name=fname, extras=extras,
                                   resource_type='file')
                for key, value, _ in f.getUserMetadata():
                    extras[key] = value
                res.extras = extras
                resgrp = pkg.resource_groups[0]
                resgrp.resources.append(res)
                Session.add(res)
                Session.add(resgrp)
                # Overwritten per file: the last matched file wins.
                rev.message = "Update from iRODS, matched file %s" % fname
            for key, value, _ in coll.getUserMetadata():
                pkg.extras[key] = value
            Session.add(pkg)
            model.repo.commit()
        finally:
            conn.disconnect()
        h.flash_success("iRODS import to dataset OK! Imported %s resources." % i)
    h.redirect_to(controller='package', action='read', id=id)
Exemple #6
0
    def search_action(self):
        """Routes dataproxy type resources to dataproxy_search method, else performs 'datastore_search' action.

        Requests without a ``resource_id`` fall through to the default
        action instead of failing with a KeyError here.
        """
        #TODO: No access control checks for dataproxy resources!
        request_data = self._get_request_data(try_url_params=True)
        if 'resource_id' in request_data:
            resource = Resource.get(request_data['resource_id'])
            if resource is not None and resource.url_type == 'dataproxy':
                pylons.response.headers['Content-Type'] = 'application/json;charset=utf-8'
                return self.dataproxy_search(request_data, resource)

        #Default action otherwise
        return self.action('datastore_search', ver=3)
Exemple #7
0
    def search_action(self):
        """Routes dataproxy type resources to dataproxy_search method, else performs 'datastore_search' action.

        When the request carries ``downloaded=true`` (case-insensitive) the
        dataproxy result is returned as a CSV attachment: one header line
        of field ids, then one line per record with every non-null cell
        double-quoted. Otherwise the dataproxy JSON is returned as is.
        Requests without a ``resource_id`` fall through to the default
        action.
        """
        #TODO: No access control checks for dataproxy resources!
        request_data = self._get_request_data(try_url_params=True)

        downloaded = False
        if 'downloaded' in request_data:
            downloaded = str(request_data['downloaded']).upper() == 'TRUE'

        resource = None
        if 'resource_id' in request_data:
            resource = Resource.get(request_data['resource_id'])

        if resource is not None and resource.url_type == 'dataproxy':
            if downloaded:
                pylons.response.headers['Content-Type'] = 'text/csv'
                pylons.response.headers[
                    'Content-Disposition'] = 'attachment;filename="{0}.{1}"'.format(
                        resource.name, resource.format)
                datas = json.loads(
                    self.dataproxy_search(request_data, resource))
                result = datas['result']

                def _csv_text(value):
                    # Python 2 text handling: unicode is encoded as UTF-8,
                    # everything else is stringified.
                    if isinstance(value, unicode):
                        return value.encode('utf-8')
                    return str(value)

                fields = result['fields']
                # Header keeps the original layout: unquoted ids, each
                # followed by a comma.
                lines = [''.join(_csv_text(val['id']) + ',' for val in fields)]
                for row in result['records']:
                    cells = []
                    for val in fields:
                        cell = row.get(val['id'])
                        if cell is None:
                            # Missing and null cells are both left empty.
                            cells.append(',')
                        else:
                            # Double embedded quotes so the quoted cell
                            # stays well-formed CSV.
                            escaped = _csv_text(cell).replace('"', '""')
                            cells.append('"' + escaped + '",')
                    lines.append(''.join(cells))

                return str('\n'.join(lines))
            else:
                pylons.response.headers[
                    'Content-Type'] = 'application/json;charset=utf-8'
                return self.dataproxy_search(request_data, resource)

        #Default action otherwise
        return self.action('datastore_search', ver=3)
Exemple #8
0
    def search_action(self):
        """Serve dataproxy resources via dataproxy_search; everything else via the 'datastore_search' action.

        The parsed request data is logged at info level. A request without
        ``resource_id`` goes straight to the default action.
        """
        #TODO: No access control checks for dataproxy resources!
        request_data = self._get_request_data(try_url_params=True)
        log.info('{}'.format(request_data))

        resource = None
        if 'resource_id' in request_data:
            resource = Resource.get(request_data['resource_id'])

        if resource is None or resource.url_type != 'dataproxy':
            #Default action otherwise
            return self.action('datastore_search', ver=3)

        pylons.response.headers['Content-Type'] = 'application/json;charset=utf-8'
        return self.dataproxy_search(request_data, resource)
Exemple #9
0
def update_res_license(context, res_dict, license_id):
    """
    Replace the license record of a resource.

    Any existing CeonResourceLicense row for the resource is deleted and a
    new one with ``license_id`` is merged into ``context['session']``.

    Returns the newly constructed CeonResourceLicense instance.
    Raises Exception if the resource named by ``res_dict['id']`` does not
    exist.
    """
    session = context['session']
    resource_id = res_dict['id']
    log.debug(u'Updating license for resource {}: {}'.format(resource_id, license_id))
    resource = Resource.get(resource_id)
    # Fail with a clear message instead of an AttributeError on None below.
    if not resource:
        raise Exception(u'Resource "{}" not found'.format(resource_id))
    res_license = session.query(CeonResourceLicense).filter(CeonResourceLicense.resource_id == resource.id).first()
    if res_license:
        log.debug(u'Deleting license res_license: {}'.format(res_license))
        session.delete(res_license)
    new_res_license = CeonResourceLicense(resource_id=resource.id, license_id=license_id)
    # NOTE(review): the object handed to merge() is returned, not merge()'s
    # return value - confirm callers do not need the session-bound copy.
    session.merge(new_res_license)
    log.debug(u'Created license res_license: {}'.format(new_res_license))
    return new_res_license
Exemple #10
0
    def search_action(self):
        """Routes dataproxy type resources to dataproxy_search method, else performs 'datastore_search' action.

        With ``downloaded=true`` (case-insensitive) in the request, the
        dataproxy result is sent as a CSV attachment: a header line of
        field ids followed by one line per record, non-null cells
        double-quoted. Without it the dataproxy JSON is returned. A request
        lacking ``resource_id`` is handed to the default action.
        """
        #TODO: No access control checks for dataproxy resources!
        request_data = self._get_request_data(try_url_params=True)

        downloaded = False
        if 'downloaded' in request_data:
            downloaded = str(request_data['downloaded']).upper() == 'TRUE'

        resource = None
        if 'resource_id' in request_data:
            resource = Resource.get(request_data['resource_id'])

        is_dataproxy = resource is not None and resource.url_type == 'dataproxy'
        if is_dataproxy and downloaded:
            pylons.response.headers['Content-Type'] = 'text/csv'
            pylons.response.headers['Content-Disposition'] = 'attachment;filename="{0}.{1}"'.format(resource.name, resource.format)
            datas = json.loads(self.dataproxy_search(request_data, resource))
            result = datas['result']

            def _to_str(value):
                # Python 2: encode unicode as UTF-8, stringify anything else.
                if isinstance(value, unicode):
                    return value.encode('utf-8')
                return str(value)

            def _quoted(value):
                # Embedded double quotes are doubled so the cell remains
                # valid inside its surrounding quotes.
                return '"' + _to_str(value).replace('"', '""') + '",'

            fields = result['fields']
            # Header: unquoted field ids, each followed by a comma.
            out = [''.join(_to_str(val['id']) + ',' for val in fields)]
            for row in result['records']:
                row_cells = []
                for val in fields:
                    cell = row.get(val['id'])
                    # Absent and null cells both render as an empty column.
                    row_cells.append(',' if cell is None else _quoted(cell))
                out.append(''.join(row_cells))

            return str('\n'.join(out))

        if is_dataproxy:
            pylons.response.headers['Content-Type'] = 'application/json;charset=utf-8'
            return self.dataproxy_search(request_data, resource)

        #Default action otherwise
        return self.action('datastore_search', ver=3)
 def view(self, id):
     """
     Show the iRODS import form for a resource.

     Access is checked against 'resource_update'; unauthorized users get a
     401. When the request parameters contain 'save', sync_irods is called
     to do the actual import before the form is rendered.
     """
     res = Resource.get(id)
     acting_user = c.user or c.author
     context = {'model': model, 'user': acting_user, 'resource': res}
     try:
         check_access('resource_update', context, {'id': id})
     except NotAuthorized:
         abort(401, _('Not authorized to see this page'))
     if 'save' in request.params:
         sync_irods(request.params, id)
     # Expose the resource identity to the template.
     c.resource_name, c.resource_id = res.name, res.id
     return render('ckanext/irods/irods.html')
Exemple #12
0
def update_res_license(context, res_dict, license_id):
    """
    Set the license of a resource, updating the existing record in place.

    If a CeonResourceLicense row exists for the resource its ``license_id``
    is changed; otherwise a new row is created. Either way the record is
    merged into ``context['session']`` and returned.

    Raises Exception if the resource named by ``res_dict['id']`` does not
    exist.
    """
    session = context['session']
    resource_id = res_dict['id']
    log.debug(u'Updating license for resource {}: {}'.format(resource_id, license_id))
    resource = Resource.get(resource_id)
    # Fail with a clear message instead of an AttributeError on None below.
    if not resource:
        raise Exception(u'Resource "{}" not found'.format(resource_id))
    res_license = session.query(CeonResourceLicense).filter(CeonResourceLicense.resource_id == resource.id).first()
    if res_license:
        res_license.license_id = license_id
        log.debug(u'Updated license res_license: {}'.format(res_license))
        session.merge(res_license)
        return res_license

    new_res_license = CeonResourceLicense(resource_id=resource.id, license_id=license_id)
    session.merge(new_res_license)
    log.debug(u'Created license res_license: {}'.format(new_res_license))
    return new_res_license
Exemple #13
0
 def view(self, id):
     """
     Render the iRODS resource import form.

     The caller must pass the 'resource_update' access check, otherwise the
     request is aborted with 401. If 'save' is among the request
     parameters, sync_irods performs the import first.
     """
     res = Resource.get(id)
     context = dict(model=model, user=c.user or c.author, resource=res)
     try:
         check_access('resource_update', context, dict(id=id))
     except NotAuthorized:
         abort(401, _('Not authorized to see this page'))

     submitted = 'save' in request.params
     if submitted:
         sync_irods(request.params, id)

     # Template variables identifying the resource.
     c.resource_name = res.name
     c.resource_id = res.id
     return render('ckanext/irods/irods.html')
    def test_extractor_extract_update_ignored_format(self, send_task):
        """
        extractor_extract on a processed resource whose format was changed
        to an ignored one reports 'ignored' and schedules nothing.
        """
        res_dict = factories.Resource(format='pdf')
        send_task.reset_mock()
        fake_process(res_dict)
        resource_id = res_dict['id']

        # Switch the stored resource to a format the test expects to be
        # ignored.
        stored = Resource.get(resource_id)
        stored.format = 'foo'
        stored.save()

        outcome = call_action('extractor_extract', id=resource_id)
        assert_equal(outcome['status'], 'ignored', 'Wrong state')
        assert_true(outcome['task_id'] is None, 'Unexpected task ID')
        assert_equal(send_task.call_count, 0,
                     'Wrong number of extraction tasks.')
        assert_no_metadata(res_dict)
    def test_extractor_extract_update_ignored_format(self, send_task):
        """
        A resource updated to an ignored format: extractor_extract must
        answer 'ignored', return no task ID, send no task and leave no
        metadata.
        """
        res_dict = factories.Resource(format='pdf')
        send_task.reset_mock()
        fake_process(res_dict)

        # Change the format directly on the model object.
        model_resource = Resource.get(res_dict['id'])
        model_resource.format = 'foo'
        model_resource.save()

        response = call_action('extractor_extract', id=res_dict['id'])
        status, task_id = response['status'], response['task_id']
        assert_equal(status, 'ignored', 'Wrong state')
        assert_true(task_id is None, 'Unexpected task ID')
        assert_equal(send_task.call_count, 0,
                     'Wrong number of extraction tasks.')
        assert_no_metadata(res_dict)
    def test_extractor_extract_update_indexed_format(self, send_task):
        """
        extractor_extract on a processed resource whose format was changed
        to another indexed one reports 'update' and schedules one task.
        """
        res_dict = factories.Resource(format='pdf')
        send_task.reset_mock()
        fake_process(res_dict)
        resource_id = res_dict['id']

        # Switch the stored resource to a different format the test
        # expects to be indexed.
        stored = Resource.get(resource_id)
        stored.format = 'doc'
        stored.save()

        outcome = call_action('extractor_extract', id=resource_id)
        assert_equal(outcome['status'], 'update', 'Wrong state')
        assert_false(outcome['task_id'] is None, 'Missing task ID')
        assert_equal(outcome['task_id'],
                     get_metadata(res_dict).task_id, 'Task IDs differ.')
        assert_equal(send_task.call_count, 1,
                     'Wrong number of extraction tasks.')
    def test_extractor_extract_update_indexed_format(self, send_task):
        """
        A resource updated to an indexed format: extractor_extract must
        answer 'update', return the task ID stored in the metadata and
        send exactly one task.
        """
        res_dict = factories.Resource(format='pdf')
        send_task.reset_mock()
        fake_process(res_dict)

        # Change the format directly on the model object.
        model_resource = Resource.get(res_dict['id'])
        model_resource.format = 'doc'
        model_resource.save()

        response = call_action('extractor_extract', id=res_dict['id'])
        assert_equal(response['status'], 'update', 'Wrong state')
        task_id = response['task_id']
        assert_false(task_id is None, 'Missing task ID')
        assert_equal(task_id, get_metadata(res_dict).task_id,
                     'Task IDs differ.')
        assert_equal(send_task.call_count, 1,
                     'Wrong number of extraction tasks.')
Exemple #18
0
def update_resource_url(context, res_dict):
    """
    Pass an uploaded resource's url through remove_locales_from_url and
    persist the result when it differs.

    Resources without a url, or whose url_type is not 'upload', are
    returned untouched. Otherwise ``res_dict['url']`` and the stored
    Resource are both updated via ``context['session']``.

    Returns ``res_dict`` (the same object that was passed in).
    Raises Exception if the url changed but the resource cannot be found.
    """
    current_url = res_dict.get('url')
    if not current_url:
        return res_dict
    if res_dict.get('url_type') != 'upload':
        return res_dict

    log.debug(u"Updating resource {} url {}".format(res_dict['id'], current_url))
    res_url = remove_locales_from_url(current_url)
    log.debug(u"new url {}".format(res_url))
    if current_url == res_url:
        # Nothing to rewrite.
        return res_dict

    log.debug(u"here 1")
    res_dict['url'] = res_url
    session = context['session']
    res = Resource.get(res_dict['id'])
    log.debug(u"here 2 {}".format(res))
    if not res:
        raise Exception(u'Resource "{}" not found'.format(res_dict['id']))
    res.url = res_url
    session.merge(res)
    log.debug(u"here 3 {}".format(res))
    return res_dict
Exemple #19
0
    def _get_ids(self, only_with_metadata=False):
        """
        Resolve the command line arguments to a list of resource IDs.

        A single ``all`` argument (any letter case) expands to a sorted
        list of IDs: those reported by the ``extractor_list`` action when
        ``only_with_metadata`` is true, otherwise the IDs of all active
        resources. Any other arguments are returned as a copy, unchanged.

        ``_error`` is called when no argument was given.
        """
        from ckan.plugins import toolkit
        if len(self.args) < 1:
            _error('Missing argument. Specify one or more resource IDs '
                   'or "all".')

        wants_all = len(self.args) == 1 and self.args[0].lower() == 'all'
        if not wants_all:
            return self.args[:]

        if only_with_metadata:
            list_action = toolkit.get_action('extractor_list')
            return sorted(list_action({}, {}))

        from ckan.model import Resource
        return sorted(r.id for r in Resource.active())
Exemple #20
0
    def test_link_and_map_shown(self):
        """
        A dataset with a WMS resource shows the WMS link on its page and
        loads the preview script on its map page.
        """
        name = u'annakarenina'
        wms_url = 'http://maps.bgs.ac.uk/ArcGIS/services/BGS_Detailed_Geology/MapServer/WMSServer?'

        # Attach a WMS resource to the dataset.
        model.repo.new_revision()
        pkg = Package.get(name)
        wms_resource = Resource(url=wms_url, format='WMS')
        pkg.resources.append(wms_resource)
        pkg.save()
        model.repo.commit_and_remove()

        # Load the dataset page and check if link appears
        dataset_page = self.app.get(
            url_for(controller='package', action='read', id=name))
        assert 'View available WMS layers' in dataset_page, dataset_page

        # Load the dataset map preview page and check if libraries are loaded
        map_page = self.app.get('/dataset/%s/map' % name)
        script_tag = '<script type="text/javascript" src="/ckanext/spatial/js/wms_preview.js"></script>'
        assert script_tag in map_page, map_page
        # The preview is set up with the url stripped of its query string.
        setup_call = 'CKAN.WMSPreview.setup("%s");' % wms_url.split('?')[0]
        assert setup_call in map_page
Exemple #21
0
def sync_irods(params, id):
    """
    Import iRODS user metadata into the extras of an existing CKAN resource.

    The iRODS file is looked up as ``params['path']`` joined with the last
    path segment of the resource name. Its user metadata replaces the
    resource extras. The user is then redirected to the resource page.
    Resources without a name are rejected with a flash error.
    """
    from irods import getFileUserMetadata, rcModAccessControl
    rev = model.repo.new_revision()
    conn = get_connection_from_params(params)
    # Release the connection on every path, including the
    # "resource is an URL" branch and failures during the import.
    try:
        resource = Resource.get(id)
        path = params['path']
        extras = {}
        # Lets handle only resources with file names
        if resource.name:
            fname = "%s/%s" % (path, resource.name.split('/')[-1])
            log.debug(fname)
            i = 0
            access = rcModAccessControl()
            log.debug(access.getPath())
            if conn:
                for met in getFileUserMetadata(conn, fname):
                    i += 1
                    key, value, _ = met
                    extras[key] = value
                resource.extras = extras
                Session.add(resource)
                # Set the message before committing so it is part of the
                # committed revision.
                rev.message = "Update from iRODS, matched file %s" % fname
                model.repo.commit()
                h.flash_success(
                    "iRODS import to resource OK! Imported %s metadatas" % i)
            else:
                h.flash_error("Could not connect to iRODS!")
        else:
            h.flash_error("Resource is an URL, cannot import!")
    finally:
        if conn:
            conn.disconnect()
    h.redirect_to(controller='package', action='resource_read',
                  id=resource.resource_group.package.name,
                  resource_id=resource.id)
def sync_irods(params, id):
    """
    Replace a CKAN resource's extras with the user metadata of its iRODS file.

    The file path is ``params['path']`` plus the last path segment of the
    resource name. Only resources that have a name are handled; others get
    a flash error. The request always ends with a redirect to the resource
    page.
    """
    from irods import getFileUserMetadata, rcModAccessControl
    rev = model.repo.new_revision()
    conn = get_connection_from_params(params)
    # The finally clause disconnects on every path, including the
    # nameless-resource branch and any exception.
    try:
        resource = Resource.get(id)
        path = params['path']
        # Lets handle only resources with file names
        if not resource.name:
            h.flash_error("Resource is an URL, cannot import!")
        else:
            fname = "%s/%s" % (path, resource.name.split('/')[-1])
            log.debug(fname)
            access = rcModAccessControl()
            log.debug(access.getPath())
            if not conn:
                h.flash_error("Could not connect to iRODS!")
            else:
                extras = {}
                i = 0
                for key, value, _ in getFileUserMetadata(conn, fname):
                    i += 1
                    extras[key] = value
                resource.extras = extras
                Session.add(resource)
                # The message has to be set before the commit to be
                # stored with the revision.
                rev.message = "Update from iRODS, matched file %s" % fname
                model.repo.commit()
                h.flash_success("iRODS import to resource OK! Imported %s metadatas" % i)
    finally:
        if conn:
            conn.disconnect()
    h.redirect_to(controller='package', action='resource_read',
                  id=resource.resource_group.package.name,
                  resource_id=resource.id)
 def read_data(self, id, resource_id):
     """
     Mine a locally stored resource file for keywords and render the result.

     Sets ``c.pkg_dict``, ``c.package`` and ``c.resource`` for the template.
     For text and office-type formats the file is run through the orngText
     pipeline; n-grams scoring above 10.0 are exposed as ``c.data_tags``
     (at most 30, highest first), and the package extra
     'autoextracted_description' is filled in if it is not set yet.

     Redirects to the resource page with a flash error when the file is
     not available through the storage backend or the format is not
     supported.
     """
     res = Resource.get(resource_id)
     pkg = Package.get(id)
     c.pkg_dict = pkg.as_dict()
     c.package = pkg
     c.resource = get_action('resource_show')({'model': model},
                                                  {'id': resource_id})
     # Storage label = the part of the url after '<site_url>/storage/f/',
     # url-unquoted.
     label = res.url.split(config.get('ckan.site_url') + '/storage/f/')[-1]
     label = urllib2.unquote(label)
     ofs = get_ofs()
     try:
         # Drop the 'file://' scheme to obtain a filesystem path.
         furl = ofs.get_url(BUCKET, label).split('file://')[-1]
     except FileNotFoundException:
         h.flash_error(_('Cannot do data mining on remote resource!'))
         url = h.url_for(controller='package', action='resource_read',
                         id=id, resource_id=resource_id)
         return redirect(url)
     wordstats = {}
     ret = {}
     if res.format in ('TXT', 'txt'):
         # Temp file handed to orngText: the data file path on the first
         # line, followed by a fixed line of words.
         wdsf, wdspath = tempfile.mkstemp()
         os.write(wdsf, "%s\nmetadata description title information" % furl)
         # wordfile itself is never read; leaving the with-block closes wdsf.
         with os.fdopen(wdsf, 'r') as wordfile:
             preproc = orngText.Preprocess()
             table = orngText.loadFromListWithCategories(wdspath)
             data = orngText.bagOfWords(table, preprocessor=preproc)
             words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
         # Only the metas of the last element of words survive this loop.
         for i in range(len(words)):
             d = words[i]
             wordstats = d.get_metas(str)
         # Keep entries whose value is above 10.0.
         for k, v in wordstats.items():
             if v.value > 10.0:
                 ret[unicode(k, 'utf8')] = v.value
         from operator import itemgetter
         # Top 30 tags, highest value first.
         c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
         os.remove(wdspath)
         # NOTE(review): iterates over len(data) but indexes words - looks
         # like it can raise IndexError if data is longer; confirm.
         for i in range(len(data)):
                 d = words[i]
                 wordstats = d.get_metas(str)
         # words is rebound here to the list of wordstats keys.
         words = []
         for k, v in wordstats.items():
             words.append(k)
         model.repo.new_revision()
         # Never overwrite an existing auto-extracted description.
         if not 'autoextracted_description' in pkg.extras:
             pkg.extras['autoextracted_description'] = ' '.join(words)
         pkg.save()
         return render('datamining/read.html')
     # NOTE(review): 'doc' is listed twice in this tuple.
     elif res.format in ('odt', 'doc', 'xls', 'ods', 'odp', 'ppt', 'doc', 'html'):
         # convert_to_text returns an open descriptor and a path; a falsy
         # path is treated as a failed conversion.
         textfd, textpath = convert_to_text(res, furl)
         if not textpath:
             h.flash_error(_('This file could not be mined for any data!'))
             os.close(textfd)
             return render('datamining/read.html')
         else:
             # Same pipeline as the TXT branch, run on the converted text.
             wdsf, wdspath = tempfile.mkstemp()
             os.write(wdsf, "%s\nmetadata description title information" % textpath)
             preproc = orngText.Preprocess()
             table = orngText.loadFromListWithCategories(wdspath)
             data = orngText.bagOfWords(table, preprocessor=preproc)
             words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
             # Only the metas of the last element of words survive this loop.
             for i in range(len(words)):
                 d = words[i]
                 wordstats = d.get_metas(str)
             for k, v in wordstats.items():
                 if v.value > 10.0:
                     ret[unicode(k, 'utf8')] = v.value
             from operator import itemgetter
             c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
             # Close both descriptors and remove both temporary files.
             os.close(textfd)
             os.close(wdsf)
             os.remove(wdspath)
             os.remove(textpath)
             # NOTE(review): same len(data)/words index mismatch as in the
             # TXT branch - confirm.
             for i in range(len(data)):
                 d = words[i]
                 wordstats = d.get_metas(str)
             words = []
             for k, v in wordstats.items():
                 log.debug(k)
                 # Unlike the TXT branch, keys go through
                 # substitute_ascii_equivalents before being joined.
                 words.append(substitute_ascii_equivalents(k))
             model.repo.new_revision()
             if not 'autoextracted_description' in pkg.extras:
                 pkg.extras['autoextracted_description'] = ' '.join(words)
             pkg.save()
             return render('datamining/read.html')
     else:
         # Unsupported format: send the user back to the resource page.
         h.flash_error(_('This metadata document is not in proper format for data mining!'))
         url = h.url_for(controller='package', action='resource_read',
                         id=id, resource_id=resource_id)
         return redirect(url)