def delete(self, **kwargs):
    self.kwargs = kwargs
    user = User.objects.get_or_404(id=request.form.get('user_id'))
    member = get_by(self.organization.members, 'user', user)
    self.organization.members.remove(member)
    self.organization.save()
    return '', 204
def detect_by_url(self, row):
    url = row['url']
    hashed_url = hash_url(url)
    found = False
    datasets = Dataset.objects.filter(resources__urlhash=hashed_url)
    for dataset in datasets:
        resource = get_by(dataset.resources, 'urlhash', hashed_url)
        self.resources.append({
            'dataset': dataset,
            'resource': resource,
            'data': row,
        })
        found = True
    resources = CommunityResource.objects.filter(urlhash=hashed_url)
    for resource in resources:
        self.community_resources.append({
            'resource': resource,
            'data': row,
        })
        found = True
    if not found:
        log.error('No resource found by url', extra={
            'hashed_url': hashed_url,
            'url': url
        })
def remote_organizations(self):
    response = self.get('organization_list')
    for name in response['result']:
        details = self.get('organization_show', {'id': name})['result']
        organization = self.get_harvested(Organization, details['id'])
        organization.name = details['title']
        organization.slug = details['name']
        organization.description = details['description']
        organization.image_url = details['image_url'] or None
        if self.config.get('users') is not None:
            for member in details['users']:
                user = self.get_harvested(User, member['id'], create=False)
                if user and not get_by(organization.members, 'user', user):
                    role = 'admin' if member['capacity'] == 'admin' else 'editor'
                    organization.members.append(Member(role=role, user=user))
        yield organization
        if not organization.id:
            continue
        followers = self.get('group_follower_list', {'id': name})['result']
        for follower in followers:
            user = self.get_harvested(User, follower['id'], create=False)
            if user:
                follow, created = FollowOrg.objects.get_or_create(follower=user, following=organization)
def get_resource(id):
    '''Fetch a resource given its UUID'''
    dataset = Dataset.objects(resources__id=id).first()
    if dataset:
        return get_by(dataset.resources, 'id', id)
    else:
        return CommunityResource.objects(id=id).first()
def remote_organizations(self):
    response = self.get('organization_list')
    for name in response['result']:
        details = self.get('organization_show', {'id': name})['result']
        organization = self.get_harvested(Organization, details['id'])
        organization.name = details['title']
        organization.slug = details['name']
        organization.description = details['description']
        organization.image_url = details['image_url'] or None
        if self.config.get('users') is not None:
            for member in details['users']:
                user = self.get_harvested(User, member['id'], create=False)
                if user and not get_by(organization.members, 'user', user):
                    role = 'admin' if member[
                        'capacity'] == 'admin' else 'editor'
                    organization.members.append(
                        Member(role=role, user=user))
        yield organization
        if not organization.id:
            continue
        followers = self.get('group_follower_list', {'id': name})['result']
        for follower in followers:
            user = self.get_harvested(User, follower['id'], create=False)
            if user:
                follow, created = FollowOrg.objects.get_or_create(
                    follower=user, following=organization)
def detect_by_hashed_url(self, hashed_url, row):
    found = False
    try:
        datasets = Dataset.objects.filter(resources__urlhash=hashed_url)
        for dataset in datasets:
            resource = get_by(dataset.resources, 'urlhash', hashed_url)
            self.resources.append({
                'dataset': dataset,
                'resource': resource,
                'data': row,
            })
            found = True
    except Dataset.DoesNotExist:
        pass
    try:
        resources = CommunityResource.objects.filter(urlhash=hashed_url)
        for resource in resources:
            self.community_resources.append({
                'resource': resource,
                'data': row,
            })
            found = True
    except CommunityResource.DoesNotExist:
        pass
    if not found:
        log.error('No object found for urlhash %s' % hashed_url)
def test_coverage_for_level(self):
    register_level('country', 'included', 'Included level')
    included = [TerritoryFactory(level='included') for _ in range(2)]
    excluded = [TerritoryFactory(level='country') for _ in range(2)]
    [
        VisibleDatasetFactory(spatial=SpatialCoverageFactory(
            territories=[t.reference()]))
        for t in included
    ]
    [
        VisibleDatasetFactory(spatial=SpatialCoverageFactory(
            territories=[t.reference()]))
        for t in excluded
    ]
    response = self.get(url_for('api.spatial_coverage', level='included'))
    self.assert200(response)
    self.assertEqual(len(response.json['features']), 2)
    for feature in response.json['features']:
        self.assertEqual(feature['type'], 'Feature')
        territory = get_by(included, 'id', ObjectId(feature['id']))
        self.assertIsNotNone(territory)
        self.assertEqual(feature['geometry'], territory.geom)
        properties = feature['properties']
        self.assertEqual(properties['name'], territory.name)
        self.assertEqual(properties['code'], territory.code)
        self.assertEqual(properties['level'], 'included')
        self.assertEqual(properties['datasets'], 1)
def publish(document, resource_id, action):
    if action == KafkaMessageType.DELETED:
        resource = None
    else:
        resource = serialize_resource_for_event(
            get_by(document.resources, 'id', resource_id))
    message_type = f'resource.{action.value}'
    produce(id=str(resource_id), message_type=message_type,
            document=resource, dataset_id=str(document.id))
def on_resource_created(dataset, resource_id=None):
    log.debug('on_resource_created triggered in link_proxy')
    if not resource_id:
        log.error('No resource_id provided')
        return
    resource = get_by(dataset.resources, 'id', resource_id)
    log.debug('link_proxy sending check for %s' % resource.url)
    check.delay(dataset.id, resource.id, resource.url)
def delete(self, **kwargs):
    self.kwargs = kwargs
    org = self.get_object()
    user = User.objects.get_or_404(id=request.form.get('user_id'))
    member = get_by(org.members, 'user', user)
    org.members.remove(member)
    org.save()
    return '', 204
def delete(self, slug, rid):
    dataset = Dataset.objects.get_or_404(slug=slug)
    resource = get_by(dataset.resources, 'id', UUID(rid))
    if not resource:
        abort(404)
    dataset.resources.remove(resource)
    dataset.save()
    return '', 204
def put(self, dataset):
    '''Reorder resources'''
    new_resources = []
    for rid in request.json:
        resource = get_by(dataset.resources, 'id', UUID(rid))
        new_resources.append(resource)
    dataset.resources = new_resources
    dataset.save()
    return dataset.resources, 200
def resource_redirect(id):
    '''
    Redirect to the latest version of a resource given its identifier.
    '''
    dataset = Dataset.objects(resources__id=id).first()
    if dataset:
        resource = get_by(dataset.resources, 'id', id)
    else:
        resource = CommunityResource.objects(id=id).first()
    return redirect(resource.url.strip()) if resource else abort(404)
def on_form_valid(self, form):
    user = User.objects.get_or_404(id=form.pk.data)
    member = get_by(self.organization.members, 'user', user)
    if member:
        member.role = form.value.data
    else:
        member = Member(user=user, role=form.value.data or 'editor')
        self.organization.members.append(member)
    self.organization.save()
    return '', 200
def on_form_valid(self, form):
    user = User.objects.get_or_404(id=form.pk.data)
    member = get_by(self.organization.members, 'user', user)
    if member:
        member.role = form.value.data
    else:
        member = Member(user=user, role=form.value.data or 'editor')
        self.organization.members.append(member)
    self.organization.save()
    notify_new_member.delay(self.organization, member)
    return '', 200
def put(self, slug, rid):
    dataset = Dataset.objects.get_or_404(slug=slug)
    resource = get_by(dataset.resources, 'id', UUID(rid))
    if not resource:
        abort(404)
    form = ResourceForm(request.form, instance=resource, csrf_enabled=False)
    if not form.validate():
        return {'errors': form.errors}, 400
    form.populate_obj(resource)
    dataset.save()
    return marshal(resource, resource_fields)
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Map a DCAT/RDF distribution to a Resource domain model
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)
    # we shouldn't create resources without URLs
    if not url:
        log.warning(f'Resource without url: {distrib}')
        return
    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    fmt = rdf_value(distrib, DCT.format)
    if fmt:
        resource.format = fmt.lower()
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        algorithm = checksum.value(SPDX.algorithm).identifier
        algorithm = CHECKSUM_ALGORITHMS.get(algorithm)
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)
    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier
    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()
    return resource
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Map a DCAT/RDF distribution to a Resource domain model
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)
    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        algorithm = checksum.value(SPDX.algorithm).identifier
        algorithm = CHECKSUM_ALGORITHMS.get(algorithm)
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)
    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier
    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()
    return resource
def get(self, rid):
    dataset = Dataset.objects(resources__id=rid).first()
    if dataset:
        resource = get_by(dataset.resources, 'id', rid)
    else:
        resource = CommunityResource.objects(id=rid).first()
    if not resource:
        apiv2.abort(404, 'Resource does not exist')
    # Manually marshalling to make sure resource.dataset is in the scope.
    # See discussions in https://github.com/opendatateam/udata/pull/2732/files
    return marshal(
        {
            'resource': resource,
            'dataset_id': dataset.id if dataset else None
        },
        specific_resource_fields)
def detect_by_resource_id(self, resource_id, row):
    try:
        dataset = Dataset.objects.get(resources__id=resource_id)
        resource = get_by(dataset.resources, 'id', uuid.UUID(resource_id))
        self.resources.append({
            'dataset': dataset,
            'resource': resource,
            'data': row,
        })
    except Dataset.DoesNotExist:
        try:
            resource = CommunityResource.objects.get(id=resource_id)
            self.community_resources.append({
                'resource': resource,
                'data': row,
            })
        except CommunityResource.DoesNotExist:
            log.error('No object found for resource_id %s' % resource_id)
def check(self, did, rid, url):
    log.debug('Sending check for url %s' % url)
    r = requests.post('%s/' % LINK_PROXY_URL, json={
        'location': url,
    })
    if not r.status_code == 200:
        log.error('link-proxy responded w/ status code %s' % r.status_code)
        return
    data = r.json()
    if '_id' not in data:
        log.error('link-proxy did not respond with an _id (%s)' % data)
        return
    check_id = data['_id']
    dataset = Dataset.objects.get(id=did)
    resource = get_by(dataset.resources, 'id', rid)
    resource.extras['link_proxy:check_id'] = check_id
    resource.save(signal_kwargs={'ignores': ['post_save']})
    log.debug('Check sent for url %s with id %s' % (url, check_id))
def check_resources(self, number):
    '''Check <number> of URLs that have not been (recently) checked'''
    if not current_app.config.get('LINKCHECKING_ENABLED'):
        log.error('Link checking is disabled.')
        return
    base_pipeline = [
        {'$match': {'resources': {'$gt': []}}},
        {'$project': {'resources._id': True,
                      'resources.extras.check:date': True}},
        {'$unwind': '$resources'},
    ]
    # unchecked resources
    pipeline = base_pipeline + [
        {'$match': {'resources.extras.check:date': {'$eq': None}}},
        {'$limit': number}
    ]
    resources = list(Dataset.objects.aggregate(*pipeline))
    # not recently checked resources
    slots_left = number - len(resources)
    if slots_left:
        pipeline = base_pipeline + [
            {'$match': {'resources.extras.check:date': {'$ne': None}}},
            {'$sort': {'resources.extras.check:date': 1}},
            {'$limit': slots_left}
        ]
        resources += list(Dataset.objects.aggregate(*pipeline))
    nb_resources = len(resources)
    log.info('Checking %s resources...', nb_resources)
    for idx, dataset_resource in enumerate(resources):
        dataset_obj = Dataset.objects.get(id=dataset_resource['_id'])
        resource_id = dataset_resource['resources']['_id']
        rid = uuid.UUID(resource_id)
        resource_obj = get_by(dataset_obj.resources, 'id', rid)
        log.info('Checking resource %s (%s/%s)',
                 resource_id, idx + 1, nb_resources)
        if resource_obj.need_check():
            check_resource(resource_obj)
        else:
            log.info("--> Skipping this resource, cache is fresh enough.")
    log.info('Done.')
def test_coverage_for_level(self):
    GeoLevelFactory(id='top')
    GeoLevelFactory(id='sub', parents=['top'])
    GeoLevelFactory(id='child', parents=['sub'])
    topzones, subzones, childzones = [], [], []
    for _ in range(2):
        zone = GeoZoneFactory(level='top')
        topzones.append(zone)
        for _ in range(2):
            subzone = GeoZoneFactory(level='sub', parents=[zone.id])
            subzones.append(subzone)
            for _ in range(2):
                childzone = GeoZoneFactory(
                    level='child', parents=[zone.id, subzone.id])
                childzones.append(childzone)
    for zone in topzones + subzones + childzones:
        VisibleDatasetFactory(
            spatial=SpatialCoverageFactory(zones=[zone.id]))
    response = self.get(url_for('api.spatial_coverage', level='sub'))
    self.assert200(response)
    self.assertEqual(len(response.json['features']), len(subzones))
    for feature in response.json['features']:
        self.assertEqual(feature['type'], 'Feature')
        zone = get_by(subzones, 'id', feature['id'])
        self.assertIsNotNone(zone)
        self.assertJsonEqual(feature['geometry'], zone.geom)
        properties = feature['properties']
        self.assertEqual(properties['name'], zone.name)
        self.assertEqual(properties['code'], zone.code)
        self.assertEqual(properties['level'], 'sub')
        # Nested levels datasets should be counted
        self.assertEqual(properties['datasets'], 3)
def handle_downloads(self, row, day):
    if 'url' in row:
        try:
            hashed_url = hash_url(row['url'])
            data = (
                Dataset.objects(resources__urlhash=hashed_url).first()
                or CommunityResource.objects(urlhash=hashed_url).first()
            )
            if isinstance(data, Dataset):
                dataset = data
                resource = get_by(dataset.resources, 'urlhash', hashed_url)
                log.debug('Found resource download: %s', resource.url)
                self.count(resource, day, row)
                metric = ResourceViews(resource)
                metric.compute()
                # Use the MongoDB positional operator ($)
                cmd = 'set__resources__S__metrics__{0}'.format(metric.name)
                qs = Dataset.objects(id=dataset.id, resources__id=resource.id)
                qs.update(**{cmd: metric.value})
                if dataset.organization:
                    OrgResourcesDownloads(dataset.organization).compute()
            elif isinstance(data, CommunityResource):
                resource = data
                log.debug('Found community resource download: %s',
                          resource.url)
                self.count(resource, day, row)
                metric = CommunityResourceViews(resource)
                metric.compute()
                resource.metrics[metric.name] = metric.value
                resource.save()
        except:
            log.exception('Unable to count download for %s', row['url'])
    if 'subtable' in row:
        for subrow in row['subtable']:
            self.handle_downloads(subrow, day)
def test_coverage_for_level(self):
    GeoLevelFactory(id='top')
    GeoLevelFactory(id='sub', parents=['top'])
    GeoLevelFactory(id='child', parents=['sub'])
    topzones, subzones, childzones = [], [], []
    for _ in range(2):
        zone = GeoZoneFactory(level='top')
        topzones.append(zone)
        for _ in range(2):
            subzone = GeoZoneFactory(level='sub', parents=[zone.id])
            subzones.append(subzone)
            for _ in range(2):
                childzone = GeoZoneFactory(
                    level='child', parents=[zone.id, subzone.id])
                childzones.append(childzone)
    for zone in topzones + subzones + childzones:
        VisibleDatasetFactory(
            spatial=SpatialCoverageFactory(zones=[zone.id]))
    response = self.get(url_for('api.spatial_coverage', level='sub'))
    self.assert200(response)
    self.assertEqual(len(response.json['features']), len(subzones))
    for feature in response.json['features']:
        self.assertEqual(feature['type'], 'Feature')
        zone = get_by(subzones, 'id', feature['id'])
        self.assertIsNotNone(zone)
        assert_json_equal(feature['geometry'], zone.geom)
        properties = feature['properties']
        self.assertEqual(properties['name'], zone.name)
        self.assertEqual(properties['code'], zone.code)
        self.assertEqual(properties['level'], 'sub')
        # Nested levels datasets should be counted
        self.assertEqual(properties['datasets'], 3)
def populate_obj(self, obj, name):
    if not self.has_data:
        return
    initial_values = getattr(obj, name, [])
    new_values = []

    class Holder(object):
        pass

    holder = Holder()
    for idx, field in enumerate(self):
        initial = None
        if hasattr(self.nested_model, 'id') and 'id' in field.data:
            id = self.nested_model.id.to_python(field.data['id'])
            initial = get_by(initial_values, 'id', id)
        holder.nested = initial or self.nested_model()
        field.populate_obj(holder, 'nested')
        new_values.append(holder.nested)
    setattr(obj, name, new_values)
def _add_entry(self, formdata=None, data=unset_value, index=None):
    '''
    Fill the form with previous data if necessary to handle partial update
    '''
    if formdata:
        prefix = '-'.join((self.name, str(index)))
        basekey = '-'.join((prefix, '{0}'))
        idkey = basekey.format('id')
        if prefix in formdata:
            formdata[idkey] = formdata.pop(prefix)
        if hasattr(self.nested_model, 'id') and idkey in formdata:
            id = self.nested_model.id.to_python(formdata[idkey])
            data = get_by(self.initial_data, 'id', id)
            initial = flatten_json(self.nested_form, data.to_mongo(), prefix)
            for key, value in initial.items():
                if key not in formdata:
                    formdata[key] = value
        else:
            data = None
    return super(NestedModelList, self)._add_entry(formdata, data, index)
def detect_by_resource_id(self, resource_id, row):
    try:
        # use filter().first() to avoid double matches errors
        dataset = Dataset.objects.filter(resources__id=resource_id).first()
        if not dataset:
            raise Dataset.DoesNotExist
        resource = get_by(dataset.resources, 'id', uuid.UUID(resource_id))
        self.resources.append({
            'dataset': dataset,
            'resource': resource,
            'data': row,
            'latest': True,
        })
    except Dataset.DoesNotExist:
        try:
            resource = CommunityResource.objects.get(id=resource_id)
            self.community_resources.append({
                'resource': resource,
                'data': row,
                'latest': True,
            })
        except CommunityResource.DoesNotExist:
            log.error('No object found for resource_id %s' % resource_id)
def test_coverage_for_level(self):
    register_level('country', 'included', 'Included level')
    included = [TerritoryFactory(level='included') for _ in range(2)]
    excluded = [TerritoryFactory(level='country') for _ in range(2)]
    [VisibleDatasetFactory(spatial=SpatialCoverageFactory(territories=[t.reference()])) for t in included]
    [VisibleDatasetFactory(spatial=SpatialCoverageFactory(territories=[t.reference()])) for t in excluded]
    response = self.get(url_for('api.spatial_coverage', level='included'))
    self.assert200(response)
    self.assertEqual(len(response.json['features']), 2)
    for feature in response.json['features']:
        self.assertEqual(feature['type'], 'Feature')
        territory = get_by(included, 'id', ObjectId(feature['id']))
        self.assertIsNotNone(territory)
        self.assertEqual(feature['geometry'], territory.geom)
        properties = feature['properties']
        self.assertEqual(properties['name'], territory.name)
        self.assertEqual(properties['code'], territory.code)
        self.assertEqual(properties['level'], 'included')
        self.assertEqual(properties['datasets'], 1)
def inject_organization_needs(sender, identity):
    if current_user.is_authenticated():
        for org in Organization.objects(members__user=current_user.id):
            membership = get_by(org.members, 'user',
                                current_user._get_current_object())
            identity.provides.add(OrganizationNeed(membership.role, org.id))
def get_resource_or_404(self, dataset, id):
    resource = get_by(dataset.resources, 'id', id)
    if not resource:
        api.abort(404, 'Resource does not exist')
    return resource
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], schema)

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = data['notes']
    dataset.license = License.objects(id=data['license_id']).first()
    # dataset.license = license or License.objects.get(id='notspecified')
    dataset.tags = [t['name'] for t in data['tags']]
    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']
    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
            print 'spatial-text', extra['value']
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
            print 'spatial-uri', extra['value']
        # Update frequency
        elif extra['key'] == 'frequency':
            print 'frequency', extra['value']
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            print 'temporal_start', extra['value']
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            print 'temporal_end', extra['value']
            temporal_end = daterange_end(extra['value'])
            continue
        # else:
        #     print extra['key'], extra['value']
        dataset.extras[extra['key']] = extra['value']

    if spatial_geom:
        dataset.spatial = SpatialCoverage()
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        dataset.extras['remote_url'] = data['url']

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = res.get('description')
        resource.url = res['url']
        resource.filetype = ('api' if res['resource_type'] == 'api'
                             else 'remote')
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset
def test_attribute_not_found(self):
    '''get_by() should not fail if an object doesn't have the given attr'''
    assert get_by(TEST_LIST, 'inexistant', 'value') is None
def test_attribute_not_found(self):
    '''get_by() should not fail if an object doesn't have the given attr'''
    self.assertIsNone(get_by(TEST_LIST, 'inexistant', 'value'))
def test_not_found(self):
    '''get_by() should return None if not found'''
    self.assertIsNone(get_by(TEST_LIST, 'name', 'not-found'))
def test_find_object(self):
    '''get_by() should find an object in a list'''
    obj_lst = [ObjDict(d) for d in TEST_LIST]
    result = get_by(obj_lst, 'name', 'bbb')
    self.assertEqual(result.name, 'bbb')
    self.assertEqual(result.another, 'ddd')
def test_find_dict(self):
    '''get_by() should find a dictionary in a list'''
    result = get_by(TEST_LIST, 'name', 'bbb')
    self.assertEqual(result, {'name': 'bbb', 'another': 'ddd'})
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], schema)

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = data['notes']
    dataset.license = License.objects(id=data['license_id']).first()
    # dataset.license = license or License.objects.get(id='notspecified')
    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']
    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
            print 'spatial-text', extra['value']
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
            print 'spatial-uri', extra['value']
        # Update frequency
        elif extra['key'] == 'frequency':
            print 'frequency', extra['value']
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            print 'temporal_start', extra['value']
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            print 'temporal_end', extra['value']
            temporal_end = daterange_end(extra['value'])
            continue
        # else:
        #     print extra['key'], extra['value']
        dataset.extras[extra['key']] = extra['value']

    if spatial_geom:
        dataset.spatial = SpatialCoverage()
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        dataset.extras['remote_url'] = data['url']

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = res.get('description')
        resource.url = res['url']
        resource.filetype = ('api' if res['resource_type'] == 'api'
                             else 'remote')
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset
def remote_datasets(self):
    response = self.get('package_list')
    for name in response['result']:
        details = self.get('package_show', {'id': name})['result']
        dataset = self.get_harvested(Dataset, details['id'])

        # Core attributes
        dataset.slug = details['name']
        dataset.title = details['title']
        dataset.description = details.get('notes', 'No description')
        dataset.license = License.objects(id=details['license_id']).first() or License.objects.get(id='notspecified')
        dataset.tags = [tag['name'].lower() for tag in details['tags']]
        dataset.frequency = self.map('frequency', details) or 'unknown'
        dataset.created_at = parse(details['metadata_created'])
        dataset.last_modified = parse(details['metadata_modified'])

        if any_field(details, 'territorial_coverage', 'territorial_coverage_granularity'):
            coverage = TerritorialCoverage(
                codes=[code.strip() for code in details.get('territorial_coverage', '').split(',') if code.strip()],
                granularity=self.map('territorial_coverage_granularity', details),
            )
            dataset.extras['territorial_coverage'] = coverage
            try:
                dataset.spatial = territorial_to_spatial(dataset)
            except Exception as e:
                print 'Error while processing spatial coverage for {0}:'.format(dataset.title), e

        if all_field(details, 'temporal_coverage_from', 'temporal_coverage_to'):
            try:
                dataset.temporal_coverage = db.DateRange(
                    start=daterange_start(details.get('temporal_coverage_from')),
                    end=daterange_end(details.get('temporal_coverage_to')),
                )
            except:
                log.error('Unable to parse temporal coverage for dataset %s', details['id'])

        # Organization
        if details.get('organization'):
            dataset.organization = self.get_harvested(Organization, details['organization']['id'], False)
        else:
            # Need to fetch user from roles
            roles = self.get('roles_show', {'domain_object': name})['result']['roles']
            for role in roles:
                if role['role'] == 'admin' and role['context'] == 'Package':
                    dataset.owner = self.get_harvested(User, role['user_id'])
                    break

        # Supplier
        if details.get('supplier_id'):
            dataset.supplier = self.get_harvested(Organization, details['supplier_id'], False)

        # Remote URL
        if details.get('url'):
            dataset.extras['remote_url'] = details['url']

        # Extras
        if 'extras' in details:
            extra_mapping = self.harvester.mapping.get('from_extras', {})
            for extra in details['extras']:
                if extra['key'] in self.harvester.mapping:
                    value = self.harvester.mapping[extra['key']].get(extra['value'])
                else:
                    value = extra['value']
                if extra['key'] in extra_mapping:
                    setattr(dataset, extra_mapping[extra['key']], value)
                else:
                    dataset.extras[extra['key']] = value

        # Resources
        for res in details['resources']:
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except:
                log.error('Unable to parse resource %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.url = res['url']
            resource.description = res.get('description')
            resource.format = res.get('format')
            resource.hash = res.get('hash')
            resource.created = parse(res['created'])
            resource.modified = parse(res['revision_timestamp'])
            resource.published = resource.published or resource.created
        yield dataset

        if dataset.id:
            followers = self.get('dataset_follower_list', {'id': name})['result']
            for follower in followers:
                user = self.get_harvested(User, follower['id'], False)
                if user:
                    follow, created = FollowDataset.objects.get_or_create(follower=user, following=dataset)
def test_find_dict(self):
    '''get_by() should find a dictionary in a list'''
    result = get_by(TEST_LIST, 'name', 'bbb')
    assert result == {'name': 'bbb', 'another': 'ddd'}
def process(self, item):
    response = self.get(item.remote_id)
    encoding = chardet.detect(response.content)['encoding']
    xml = self.parse_xml(response.content.decode(encoding))
    metadata = xml['metadata']

    # Resolve the remote id from the metadata
    item.remote_id = metadata['id']
    dataset = self.get_dataset(metadata['id'])

    dataset.title = metadata['title']
    dataset.frequency = FREQUENCIES.get(metadata['frequency'], 'unknown')
    dataset.description = metadata['notes']
    dataset.private = metadata['private']
    dataset.tags = sorted(set(metadata['tags']))

    if metadata.get('license_id'):
        dataset.license = License.objects.get(id=metadata['license_id'])

    if (metadata.get('temporal_coverage_from')
            and metadata.get('temporal_coverage_to')):
        dataset.temporal_coverage = db.DateRange(
            start=metadata['temporal_coverage_from'],
            end=metadata['temporal_coverage_to'])

    if (metadata.get('territorial_coverage_code')
            or metadata.get('territorial_coverage_granularity')):
        dataset.spatial = SpatialCoverage()
        if metadata.get('territorial_coverage_granularity'):
            dataset.spatial.granularity = GRANULARITIES.get(
                metadata['territorial_coverage_granularity'])
        if metadata.get('territorial_coverage_code'):
            dataset.spatial.zones = [
                ZONES[metadata['territorial_coverage_code']]
            ]

    dataset.resources = []
    cle = get_by(metadata['resources'], 'format', 'cle')
    for row in metadata['resources']:
        if row['format'] == 'cle':
            continue
        else:
            resource = Resource(title=row['name'],
                                description=(row['description'] + '\n\n' + SSL_COMMENT).strip(),
                                filetype='remote',
                                url=row['url'],
                                format=row['format'])
            if resource.format == 'csv' and cle:
                resource.checksum = Checksum(type='sha256',
                                             value=self.get(cle['url']).text)
            if row.get('last_modified'):
                resource.modified = row['last_modified']
            dataset.resources.append(resource)

    if metadata.get('author'):
        dataset.extras['author'] = metadata['author']
    if metadata.get('author_email'):
        dataset.extras['author_email'] = metadata['author_email']
    if metadata.get('maintainer'):
        dataset.extras['maintainer'] = metadata['maintainer']
    if metadata.get('maintainer_email'):
        dataset.extras['maintainer_email'] = metadata['maintainer_email']
    for extra in metadata['extras']:
        dataset.extras[extra['key']] = extra['value']

    return dataset
def get_resource(self, dataset, url):
    resource = get_by(dataset.resources, 'url', url)
    if not resource:
        return True, Resource(url=url)
    return False, resource
def test_find_object(self):
    '''get_by() should find an object in a list'''
    obj_lst = [ObjDict(d) for d in TEST_LIST]
    result = get_by(obj_lst, 'name', 'bbb')
    assert result.name == 'bbb'
    assert result.another == 'ddd'
def test_not_found(self):
    '''get_by() should return None if not found'''
    assert get_by(TEST_LIST, 'name', 'not-found') is None
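# The get_by() tests above pin down the behavior every call site in this
# section relies on: lookup by attribute or dict key, None when nothing
# matches, and no error when an item lacks the field. A minimal sketch of
# such a helper, inferred from those tests only (the project's actual
# implementation may differ), could look like this:
def get_by(lst, field, value):
    '''Return the first item in lst whose attribute or key `field` equals `value`.

    Works for both objects (attribute access) and dicts (key access);
    returns None when nothing matches or an item does not have the field.
    '''
    for item in lst:
        if isinstance(item, dict):
            candidate = item.get(field)
        else:
            candidate = getattr(item, field, None)
        if candidate == value:
            return item
    return None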
def process(self, item):
    response = self.get(item.remote_id)
    encoding = chardet.detect(response.content)['encoding']
    xml = self.parse_xml(response.content.decode(encoding))
    metadata = xml['metadata']

    # Resolve the remote id from the metadata
    item.remote_id = metadata['id']
    dataset = self.get_dataset(metadata['id'])

    dataset.title = metadata['title']
    dataset.frequency = FREQUENCIES.get(metadata['frequency'], 'unknown')
    dataset.description = metadata['notes']
    dataset.private = metadata['private']
    dataset.tags = sorted(set(metadata['tags']))

    if metadata.get('license_id'):
        dataset.license = License.objects.get(id=metadata['license_id'])

    if (metadata.get('temporal_coverage_from')
            and metadata.get('temporal_coverage_to')):
        dataset.temporal_coverage = db.DateRange(
            start=metadata['temporal_coverage_from'],
            end=metadata['temporal_coverage_to']
        )

    if (metadata.get('territorial_coverage_code')
            or metadata.get('territorial_coverage_granularity')):
        dataset.spatial = SpatialCoverage()
        if metadata.get('territorial_coverage_granularity'):
            dataset.spatial.granularity = GRANULARITIES.get(
                metadata['territorial_coverage_granularity'])
        if metadata.get('territorial_coverage_code'):
            dataset.spatial.zones = [
                ZONES[metadata['territorial_coverage_code']]]

    dataset.resources = []
    cle = get_by(metadata['resources'], 'format', 'cle')
    for row in metadata['resources']:
        if row['format'] == 'cle':
            continue
        else:
            resource = Resource(
                title=row['name'],
                description=(
                    row['description'] + '\n\n' + SSL_COMMENT).strip(),
                filetype='remote',
                url=row['url'],
                format=row['format']
            )
            if resource.format == 'csv' and cle:
                resource.checksum = Checksum(
                    type='sha256', value=self.get(cle['url']).text)
            if row.get('last_modified'):
                resource.modified = row['last_modified']
            dataset.resources.append(resource)

    if metadata.get('author'):
        dataset.extras['author'] = metadata['author']
    if metadata.get('author_email'):
        dataset.extras['author_email'] = metadata['author_email']
    if metadata.get('maintainer'):
        dataset.extras['maintainer'] = metadata['maintainer']
    if metadata.get('maintainer_email'):
        dataset.extras['maintainer_email'] = metadata['maintainer_email']
    for extra in metadata['extras']:
        dataset.extras[extra['key']] = extra['value']

    return dataset