def dataproxy_resource_update(context, data_dict=None):
    """Intercept the default resource_update action for dataproxy resources.

    For resources whose ``url_type`` is ``dataproxy``, the clear-text
    ``db_password`` is encrypted (hex-encoded) before the stock action runs,
    and the resource URL is rewritten to this site's ``datastore_search``
    download endpoint.

    Args:
        context: Request context.
        data_dict: Parsed request parameters.

    Returns:
        see get_action('resource_update').

    Raises:
        Exception: if ckan.dataproxy.secret configuration not set.
    """
    # Normalise a missing url_type to the empty string.
    data_dict['url_type'] = data_dict.get('url_type', '')
    if data_dict['url_type'] == 'dataproxy':
        cipher_key = config.get('ckan.dataproxy.secret', False)
        if not cipher_key:
            raise Exception('ckan.dataproxy.secret must be defined to encrypt passwords')
        plain = data_dict.get('db_password', '')
        if plain == '':
            # Empty password means "keep the stored one" -- re-use the
            # already-encrypted value held on the resource's extras.
            existing = Resource.get(data_dict['id'])
            data_dict['db_password'] = existing.extras['db_password']
        else:
            # Mask the clear-text password inside the URL, then store the
            # encrypted, hex-encoded password instead of the plain value.
            data_dict['url'] = data_dict['url'].replace(plain, '_password_')
            data_dict['db_password'] = hexlify(encrypt(cipher_key, plain))
        # Point the resource at this site's datastore_search download URL
        # (this overwrites the masked URL written above).
        endpoint = config.get('ckan.site_url', '127.0.0.1')
        data_dict['url'] = '{0}/api/3/action/datastore_search?resource_id={1}&downloaded=true'.format(endpoint, data_dict['id'])
    return orig_resource_update(context, data_dict)
def datastore_search(original_action, context, data_dict):
    '''Datastore search wrapper that proxies dataproxy-backed resources.

    Resources whose ``url_type`` is not ``dataproxy`` fall straight
    through to the stock ``datastore_search`` action.
    '''
    log.debug("datastore_search: context: {0} data_dict: {1}".format(context, data_dict))
    logic.check_access('datastore_search', context)
    resource = Resource.get(data_dict['resource_id'])
    if resource is None or resource.url_type != 'dataproxy':
        # Default action otherwise
        return original_action(context, data_dict)
    # This is only a data preview: discard sorting and cap at 10 rows.
    data_dict['sort'] = None
    data_dict['limit'] = 10
    raw = SearchController().dataproxy_search(data_dict, resource)
    results = json.loads(raw)
    log.debug("datastore_search: results: {0}".format(results))
    payload = results['result']
    # Report the preview cap as the total.
    payload['total'] = data_dict['limit']
    # Datatables expects an _id column -- synthesise one, 1-based per row.
    payload['fields'] = [{"id": "_id", "type": "BIGINT"}] + payload['fields']
    for position, record in enumerate(payload['records']):
        record['_id'] = position + 1
    return payload
def dataproxy_resource_update(context, data_dict=None):
    """Wrap resource_update so dataproxy passwords are stored encrypted.

    Args:
        context: Request context.
        data_dict: Parsed request parameters.

    Returns:
        see get_action('resource_update').

    Raises:
        Exception: if ckan.dataproxy.secret configuration not set.
    """
    # Default a missing url_type to the empty string.
    data_dict['url_type'] = data_dict.get('url_type', '')
    if data_dict['url_type'] == 'dataproxy':
        key = config.get('ckan.dataproxy.secret', False)
        if not key:
            raise Exception(
                'ckan.dataproxy.secret must be defined to encrypt passwords')
        pwd = data_dict.get('db_password', '')
        if pwd == '':
            # Don't overwrite an existing password with the empty string:
            # carry the stored (already encrypted) value forward instead.
            data_dict['db_password'] = Resource.get(data_dict['id']).extras['db_password']
        else:
            # Hide the clear-text password behind a placeholder in the URL
            # and persist only the encrypted hex form.
            data_dict['url'] = data_dict['url'].replace(pwd, '_password_')
            data_dict['db_password'] = hexlify(encrypt(key, pwd))
    return orig_resource_update(context, data_dict)
def import_collection_to_package(params, id):
    """
    Import a collection to dataset. Does not import whole file data but
    rather the metadata.

    Args:
        params: request parameters; 'path' selects the iRODS collection.
        id: id or name of the target CKAN dataset (Package).

    Side effects: creates/updates Resources on the package, commits a new
    model revision, flashes a success/error message and redirects to the
    dataset read page.
    """
    from irods import irodsCollection
    path = params['path']
    pkg = Package.get(id)
    conn = get_connection_from_params(params)
    if (conn):
        coll = irodsCollection(conn, path)
        from irods import iRodsOpen
        rev = model.repo.new_revision()
        i = 0  # count of files whose metadata was imported
        for obj in coll.getObjects():
            extras = {}
            fname, _ = obj
            fpath = "%s/%s" % (coll.getCollName(), fname)
            # Only objects we can actually open are imported.
            f = iRodsOpen(conn, fpath, 'r')
            if f:
                i += 1
                # Re-use an existing resource with the same name, else
                # create a fresh file-type resource with an empty URL.
                res = Resource.by_name(fname)
                if not res:
                    res = Resource(url = '', name=fname, extras=extras, \
                                   resource_type='file')
                # Copy the iRODS user metadata triplets into extras.
                for met in f.getUserMetadata():
                    key, value, _ = met
                    extras[key] = value
                res.extras = extras
                # NOTE(review): always attaches to the first resource group;
                # assumes the package has at least one -- confirm.
                resgrp = pkg.resource_groups[0]
                resgrp.resources.append(res)
                Session.add(res)
                Session.add(resgrp)
                rev.message = "Update from iRODS, matched file %s" % fname
        # Collection-level metadata becomes package extras.
        for met in coll.getUserMetadata():
            key, value, _ = met
            pkg.extras[key] = value
        Session.add(pkg)
        model.repo.commit()
        conn.disconnect()
        h.flash_success("iRODS import to dataset OK! Imported %s resources." % i)
    else:
        h.flash_error("Could not connect to iRODS!")
    h.redirect_to(controller='package', action='read', id=id)
def search_action(self):
    """Dispatch datastore searches: dataproxy resources are answered by
    dataproxy_search, everything else by the stock 'datastore_search'."""
    #TODO: No access control checks for dataproxy resources!
    request_data = self._get_request_data(try_url_params=True)
    target = Resource.get(request_data['resource_id'])
    if target is None or target.url_type != 'dataproxy':
        # Default action otherwise
        return self.action('datastore_search', ver=3)
    pylons.response.headers['Content-Type'] = 'application/json;charset=utf-8'
    return self.dataproxy_search(request_data, target)
def search_action(self): """Routes dataproxy type resources to dataproxy_search method, else performs 'datastore_search' action""" #TODO: No access control checks for dataproxy resources! request_data = self._get_request_data(try_url_params=True) downloaded = False if 'downloaded' in request_data: downloaded = str(request_data['downloaded']).upper() == 'TRUE' resource = Resource.get(request_data['resource_id']) if resource is not None and resource.url_type == 'dataproxy': if (downloaded): pylons.response.headers['Content-Type'] = 'text/csv' pylons.response.headers[ 'Content-Disposition'] = 'attachment;filename="{0}.{1}"'.format( resource.name, resource.format) datas = json.loads( self.dataproxy_search(request_data, resource)) result = datas['result'] tmp = '' fields = result['fields'] for val in fields: field = val['id'] if isinstance(field, unicode): tmp += field.encode('utf-8') + ',' else: tmp += str(field) + ',' records = result['records'] for row in records: tmp += '\n' for val in fields: if val['id'] in row: cell = row[val['id']] if cell is not None: if isinstance(cell, unicode): tmp += '"' + cell.encode('utf-8') + '",' else: tmp += '"' + str(cell) + '",' else: tmp += ',' else: tmp += ',' return str(tmp) else: pylons.response.headers[ 'Content-Type'] = 'application/json;charset=utf-8' return self.dataproxy_search(request_data, resource) #Default action otherwise return self.action('datastore_search', ver=3)
def search_action(self):
    """Route dataproxy-backed resources to dataproxy_search; fall back to
    the stock 'datastore_search' action for everything else."""
    #TODO: No access control checks for dataproxy resources!
    request_data = self._get_request_data(try_url_params=True)
    log.info('{}'.format(request_data))
    proxied = None
    if 'resource_id' in request_data:
        proxied = Resource.get(request_data['resource_id'])
    if proxied is not None and proxied.url_type == 'dataproxy':
        pylons.response.headers['Content-Type'] = 'application/json;charset=utf-8'
        return self.dataproxy_search(request_data, proxied)
    # Default action otherwise
    return self.action('datastore_search', ver=3)
def update_res_license(context, res_dict, license_id):
    """Replace the CeonResourceLicense row for a resource.

    Any existing license row for the resource is deleted, then a fresh
    row carrying ``license_id`` is merged into the session.

    Returns the newly created CeonResourceLicense.
    """
    session = context['session']
    resource_id = res_dict['id']
    log.debug(u'Updating license for resource {}: {}'.format(resource_id, license_id))
    resource = Resource.get(res_dict['id'])
    old = (session.query(CeonResourceLicense)
           .filter(CeonResourceLicense.resource_id == resource.id)
           .first())
    if old:
        log.debug(u'Deleting license res_license: {}'.format(old))
        session.delete(old)
    fresh = CeonResourceLicense(resource_id=resource.id, license_id=license_id)
    session.merge(fresh)
    log.debug(u'Created license res_license: {}'.format(fresh))
    return fresh
def search_action(self):
    """Routes dataproxy type resources to dataproxy_search method, else
    performs 'datastore_search' action.

    With downloaded=true the dataproxy result is rendered as CSV and
    served as an attachment named "<resource.name>.<resource.format>";
    otherwise the raw JSON result is returned.
    """
    #TODO: No access control checks for dataproxy resources!
    request_data = self._get_request_data(try_url_params=True)
    # 'downloaded' is compared case-insensitively against "TRUE".
    downloaded = False
    if 'downloaded' in request_data:
        downloaded = str(request_data['downloaded']).upper() == 'TRUE'
    resource = Resource.get(request_data['resource_id'])
    if resource is not None and resource.url_type == 'dataproxy':
        if downloaded:
            pylons.response.headers['Content-Type'] = 'text/csv'
            pylons.response.headers['Content-Disposition'] = 'attachment;filename="{0}.{1}"'.format(resource.name, resource.format)
            datas = json.loads(self.dataproxy_search(request_data, resource))
            result = datas['result']
            fields = result['fields']
            # Collect fragments and join once instead of quadratic
            # string concatenation (same bytes emitted).
            parts = []
            # Header row: field ids, utf-8 encoded when unicode. Each
            # field keeps its historical trailing comma.
            for val in fields:
                field = val['id']
                if isinstance(field, unicode):
                    parts.append(field.encode('utf-8') + ',')
                else:
                    parts.append(str(field) + ',')
            for row in result['records']:
                parts.append('\n')
                for val in fields:
                    # Missing key and explicit None both yield an
                    # empty (unquoted) field, as before.
                    cell = row.get(val['id'])
                    if cell is None:
                        parts.append(',')
                    else:
                        if isinstance(cell, unicode):
                            cell = cell.encode('utf-8')
                        else:
                            cell = str(cell)
                        # BUG FIX: double up embedded quotes so cells
                        # containing '"' remain valid CSV (RFC 4180).
                        parts.append('"' + cell.replace('"', '""') + '",')
            return str(''.join(parts))
        else:
            pylons.response.headers['Content-Type'] = 'application/json;charset=utf-8'
            return self.dataproxy_search(request_data, resource)
    #Default action otherwise
    return self.action('datastore_search', ver=3)
def view(self, id):
    """Render the iRODS import form for a resource; on a POST carrying
    the 'save' parameter, delegate the actual import to sync_irods."""
    res = Resource.get(id)
    context = {
        'model': model,
        'user': c.user or c.author,
        'resource': res,
    }
    try:
        check_access('resource_update', context, {'id': id})
    except NotAuthorized:
        abort(401, _('Not authorized to see this page'))
    if 'save' in request.params:
        sync_irods(request.params, id)
    c.resource_name = res.name
    c.resource_id = res.id
    return render('ckanext/irods/irods.html')
def update_res_license(context, res_dict, license_id):
    """Update the CeonResourceLicense row for a resource, creating it
    when none exists yet.

    Returns the updated or newly created CeonResourceLicense.
    """
    session = context['session']
    resource_id = res_dict['id']
    log.debug(u'Updating license for resource {}: {}'.format(resource_id, license_id))
    resource = Resource.get(res_dict['id'])
    existing = session.query(CeonResourceLicense).filter(
        CeonResourceLicense.resource_id == resource.id).first()
    if existing:
        # In-place update of the current row.
        existing.license_id = license_id
        log.debug(u'Updated license res_license: {}'.format(existing))
        session.merge(existing)
        return existing
    created = CeonResourceLicense(resource_id=resource.id, license_id=license_id)
    session.merge(created)
    log.debug(u'Created license res_license: {}'.format(created))
    return created
def view(self, id):
    """Show the iRODS import page for a resource.

    A POST with a 'save' parameter triggers the import itself via
    sync_irods; otherwise the form is simply rendered.
    """
    resource = Resource.get(id)
    ctx = {'model': model, 'user': c.user or c.author, 'resource': resource}
    try:
        check_access('resource_update', ctx, {'id': id})
    except NotAuthorized:
        abort(401, _('Not authorized to see this page'))
    if ('save' in request.params):
        sync_irods(request.params, id)
    c.resource_name = resource.name
    c.resource_id = resource.id
    return render('ckanext/irods/irods.html')
def test_extractor_extract_update_ignored_format(self, send_task):
    """
    extractor_extract for a resource with updated, ignored format.
    """
    # Create and fully process a PDF resource, then flip its format to
    # one the extractor does not handle.
    res_dict = factories.Resource(format='pdf')
    send_task.reset_mock()
    fake_process(res_dict)
    stored = Resource.get(res_dict['id'])
    stored.format = 'foo'
    stored.save()
    result = call_action('extractor_extract', id=res_dict['id'])
    # The ignored format must be skipped: no task, no metadata.
    assert_equal(result['status'], 'ignored', 'Wrong state')
    assert_true(result['task_id'] is None, 'Unexpected task ID')
    assert_equal(send_task.call_count, 0,
                 'Wrong number of extraction tasks.')
    assert_no_metadata(res_dict)
def test_extractor_extract_update_indexed_format(self, send_task):
    """
    extractor_extract for a resource with updated, indexed format.
    """
    # Process a PDF resource, then switch its format to another
    # extractable type ('doc') to force a re-extraction.
    res_dict = factories.Resource(format='pdf')
    send_task.reset_mock()
    fake_process(res_dict)
    stored = Resource.get(res_dict['id'])
    stored.format = 'doc'
    stored.save()
    result = call_action('extractor_extract', id=res_dict['id'])
    # An update must be scheduled, with a matching task id recorded in
    # the resource's metadata.
    assert_equal(result['status'], 'update', 'Wrong state')
    assert_false(result['task_id'] is None, 'Missing task ID')
    assert_equal(result['task_id'], get_metadata(res_dict).task_id,
                 'Task IDs differ.')
    assert_equal(send_task.call_count, 1,
                 'Wrong number of extraction tasks.')
def update_resource_url(context, res_dict):
    """Strip locale prefixes from an uploaded resource's URL.

    Only resources with a non-empty 'url' and url_type 'upload' are
    touched. When the de-localised URL differs, both the dict and the
    Resource model row are updated (merged into the session; the caller
    is responsible for committing).

    Args:
        context: action context; must provide 'session'.
        res_dict: resource dictionary with at least 'id' and 'url'.

    Returns:
        res_dict, possibly with 'url' rewritten.

    Raises:
        Exception: if the resource row cannot be found.
    """
    # Nothing to do without a URL, or for non-upload resources.
    if 'url' not in res_dict or not res_dict['url']:
        return res_dict
    if 'url_type' not in res_dict or 'upload' != res_dict['url_type']:
        return res_dict
    log.debug(u"Updating resource {} url {}".format(res_dict['id'], res_dict['url']))
    res_url = remove_locales_from_url(res_dict['url'])
    log.debug(u"new url {}".format(res_url))
    if res_dict['url'] != res_url:
        res_dict['url'] = res_url
        session = context['session']
        res = Resource.get(res_dict['id'])
        if not res:
            raise Exception(u'Resource "{}" not found'.format(res_dict['id']))
        # Persist the rewritten URL on the model as well.
        res.url = res_url
        session.merge(res)
        log.debug(u"Resource {} url persisted as {}".format(res_dict['id'], res_url))
    return res_dict
def _get_ids(self, only_with_metadata=False):
    """
    Get list of resource IDs from command line arguments.

    Returns the specific IDs listed or all IDs if ``all`` was passed.
    If ``only_with_metadata`` is true and ``all`` was passed then only
    IDs of resources which have metadata are returned.
    """
    from ckan.plugins import toolkit
    if len(self.args) < 1:
        _error('Missing argument. Specify one or more resource IDs ' +
               'or "all".')
    wants_all = len(self.args) == 1 and self.args[0].lower() == 'all'
    if not wants_all:
        return self.args[:]
    if only_with_metadata:
        return sorted(toolkit.get_action('extractor_list')({}, {}))
    from ckan.model import Resource
    return sorted(r.id for r in Resource.active())
def test_link_and_map_shown(self):
    # Functional test: attaching a WMS resource to a dataset should make
    # the "View available WMS layers" link appear on the dataset page and
    # wire up the map preview page's JS.
    name = u'annakarenina'
    wms_url = 'http://maps.bgs.ac.uk/ArcGIS/services/BGS_Detailed_Geology/MapServer/WMSServer?'
    # A new revision is required before mutating the model.
    rev = model.repo.new_revision()
    pkg = Package.get(name)
    pr = Resource(url=wms_url, format='WMS')
    pkg.resources.append(pr)
    pkg.save()
    model.repo.commit_and_remove()
    # Load the dataset page and check if link appears
    offset = url_for(controller='package', action='read', id=name)
    res = self.app.get(offset)
    assert 'View available WMS layers' in res, res
    # Load the dataset map preview page and check if libraries are loaded
    offset = '/dataset/%s/map' % name
    res = self.app.get(offset)
    assert '<script type="text/javascript" src="/ckanext/spatial/js/wms_preview.js"></script>' in res, res
    # The preview is initialised with the WMS base URL (query string
    # stripped).
    assert 'CKAN.WMSPreview.setup("%s");' % wms_url.split('?')[0] in res
def sync_irods(params, id):
    """
    Fetches a resource from database with the same path as user specified
    and that matches an existing resource in CKAN.

    Args:
        params: request parameters; 'path' is the iRODS collection path.
        id: id of the CKAN resource to enrich with iRODS metadata.

    Side effects: overwrites the resource's extras with the iRODS user
    metadata, commits a model revision, flashes a status message and
    redirects to the resource read page.
    """
    from irods import getFileUserMetadata, rcModAccessControl
    rev = model.repo.new_revision()
    conn = get_connection_from_params(params)
    resource = Resource.get(id)
    path = params['path']
    extras = {}
    # Lets handle only resources with file names
    if resource.name:
        # iRODS object path: <path>/<basename of the resource name>.
        fname = "%s/%s" % (path, resource.name.split('/')[-1])
        log.debug(fname)
        i = 0  # number of metadata triplets imported
        access = rcModAccessControl()
        log.debug(access.getPath())
        if conn:
            # Copy each (key, value, unit) triplet into extras,
            # discarding the unit component.
            for met in getFileUserMetadata(conn, fname):
                i += 1
                key, value, _ = met
                extras[key] = value
            # NOTE(review): replaces existing extras wholesale.
            resource.extras = extras
            Session.add(resource)
            conn.disconnect()
            model.repo.commit()
            rev.message = "Update from iRODS, matched file %s" % fname
            h.flash_success(
                "iRODS import to resource OK! Imported %s metadatas" % i)
        else:
            h.flash_error("Could not connect to iRODS!")
    else:
        h.flash_error("Resource is an URL, cannot import!")
    h.redirect_to(controller='package', action='resource_read', \
                  id=resource.resource_group.package.name, \
                  resource_id=resource.id)
def sync_irods(params, id):
    """
    Fetches a resource from database with the same path as user specified
    and that matches an existing resource in CKAN.

    Args:
        params: request parameters; 'path' is the iRODS collection path.
        id: id of the CKAN resource to enrich with iRODS metadata.

    Side effects: overwrites the resource's extras with the iRODS user
    metadata, commits a model revision, flashes a status message and
    redirects to the resource read page.
    """
    from irods import getFileUserMetadata, rcModAccessControl
    rev = model.repo.new_revision()
    conn = get_connection_from_params(params)
    resource = Resource.get(id)
    path = params['path']
    extras = {}
    # Lets handle only resources with file names
    if resource.name:
        # iRODS object path: <path>/<basename of the resource name>.
        fname = "%s/%s" % (path, resource.name.split('/')[-1])
        log.debug(fname)
        i = 0  # number of metadata triplets imported
        access = rcModAccessControl()
        log.debug(access.getPath())
        if conn:
            # Copy each (key, value, unit) triplet into extras,
            # dropping the unit component.
            for met in getFileUserMetadata(conn, fname):
                i += 1
                key, value, _ = met
                extras[key] = value
            # NOTE(review): replaces existing extras wholesale.
            resource.extras = extras
            Session.add(resource)
            conn.disconnect()
            model.repo.commit()
            rev.message = "Update from iRODS, matched file %s" % fname
            h.flash_success("iRODS import to resource OK! Imported %s metadatas" % i)
        else:
            h.flash_error("Could not connect to iRODS!")
    else:
        h.flash_error("Resource is an URL, cannot import!")
    h.redirect_to(controller='package', action='resource_read', \
                  id=resource.resource_group.package.name, \
                  resource_id=resource.id)
def read_data(self, id, resource_id):
    """Run text/data mining over a locally stored resource file.

    Extracts word n-grams (via orngText) from the resource's file,
    exposes the top-scoring terms as c.data_tags for the template, and
    stores the mined words in the package extra
    'autoextracted_description' (only if not already set).

    Redirects back to the resource page for remote files or unsupported
    formats. NOTE(review): structure below reconstructed from collapsed
    source -- block boundaries of the first branch's `with` are inferred.
    """
    res = Resource.get(resource_id)
    pkg = Package.get(id)
    c.pkg_dict = pkg.as_dict()
    c.package = pkg
    c.resource = get_action('resource_show')({'model': model},
                                             {'id': resource_id})
    # Derive the storage label from the URL and resolve it to a local
    # file path via the object file store (OFS).
    label = res.url.split(config.get('ckan.site_url') + '/storage/f/')[-1]
    label = urllib2.unquote(label)
    ofs = get_ofs()
    try:
        furl = ofs.get_url(BUCKET, label).split('file://')[-1]
    except FileNotFoundException:
        # Remote resources cannot be mined -- bounce back to the page.
        h.flash_error(_('Cannot do data mining on remote resource!'))
        url = h.url_for(controller='package', action='resource_read',
                        id=id, resource_id=resource_id)
        return redirect(url)
    wordstats = {}
    ret = {}
    if res.format in ('TXT', 'txt'):
        # Plain text: feed the file path list straight to orngText.
        wdsf, wdspath = tempfile.mkstemp()
        os.write(wdsf, "%s\nmetadata description title information" % furl)
        with os.fdopen(wdsf, 'r') as wordfile:
            preproc = orngText.Preprocess()
            table = orngText.loadFromListWithCategories(wdspath)
            data = orngText.bagOfWords(table, preprocessor=preproc)
            words = orngText.extractWordNGram(data, threshold=10.0,
                                              measure='MI')
            # Collect terms scoring above 10.0 for display.
            for i in range(len(words)):
                d = words[i]
                wordstats = d.get_metas(str)
                for k, v in wordstats.items():
                    if v.value > 10.0:
                        ret[unicode(k, 'utf8')] = v.value
            from operator import itemgetter
            # Top 30 terms by score, for the template.
            c.data_tags = sorted(ret.iteritems(), key=itemgetter(1),
                                 reverse=True)[:30]
            os.remove(wdspath)
            # NOTE(review): only the last document's wordstats survive
            # this loop -- looks suspicious but kept as-is.
            for i in range(len(data)):
                d = words[i]
                wordstats = d.get_metas(str)
            words = []
            for k, v in wordstats.items():
                words.append(k)
            model.repo.new_revision()
            # Never overwrite an existing autoextracted description.
            if not 'autoextracted_description' in pkg.extras:
                pkg.extras['autoextracted_description'] = ' '.join(words)
            pkg.save()
            return render('datamining/read.html')
    elif res.format in ('odt', 'doc', 'xls', 'ods', 'odp', 'ppt', 'doc',
                        'html'):
        # Office/HTML documents are first converted to plain text.
        textfd, textpath = convert_to_text(res, furl)
        if not textpath:
            h.flash_error(_('This file could not be mined for any data!'))
            os.close(textfd)
            return render('datamining/read.html')
        else:
            wdsf, wdspath = tempfile.mkstemp()
            os.write(wdsf, "%s\nmetadata description title information" % textpath)
            preproc = orngText.Preprocess()
            table = orngText.loadFromListWithCategories(wdspath)
            data = orngText.bagOfWords(table, preprocessor=preproc)
            words = orngText.extractWordNGram(data, threshold=10.0,
                                              measure='MI')
            # Collect terms scoring above 10.0 for display.
            for i in range(len(words)):
                d = words[i]
                wordstats = d.get_metas(str)
                for k, v in wordstats.items():
                    if v.value > 10.0:
                        ret[unicode(k, 'utf8')] = v.value
            from operator import itemgetter
            c.data_tags = sorted(ret.iteritems(), key=itemgetter(1),
                                 reverse=True)[:30]
            # Clean up all temporary files and descriptors.
            os.close(textfd)
            os.close(wdsf)
            os.remove(wdspath)
            os.remove(textpath)
            # NOTE(review): as above, only the last wordstats is used.
            for i in range(len(data)):
                d = words[i]
                wordstats = d.get_metas(str)
            words = []
            for k, v in wordstats.items():
                log.debug(k)
                words.append(substitute_ascii_equivalents(k))
            model.repo.new_revision()
            if not 'autoextracted_description' in pkg.extras:
                pkg.extras['autoextracted_description'] = ' '.join(words)
            pkg.save()
            return render('datamining/read.html')
    else:
        h.flash_error(_('This metadata document is not in proper format '
                        'for data mining!'))
        url = h.url_for(controller='package', action='resource_read',
                        id=id, resource_id=resource_id)
        return redirect(url)