Example #1
 def delete(self, **kwargs):
     self.kwargs = kwargs
     user = User.objects.get_or_404(id=request.form.get('user_id'))
     member = get_by(self.organization.members, 'user', user)
     self.organization.members.remove(member)
     self.organization.save()
     return '', 204
Example #2
 def detect_by_url(self, row):
     url = row['url']
     hashed_url = hash_url(url)
     found = False
     datasets = Dataset.objects.filter(resources__urlhash=hashed_url)
     for dataset in datasets:
         resource = get_by(dataset.resources, 'urlhash', hashed_url)
         self.resources.append({
             'dataset': dataset,
             'resource': resource,
             'data': row,
         })
         found = True
     resources = CommunityResource.objects.filter(urlhash=hashed_url)
     for resource in resources:
         self.community_resources.append({
             'resource': resource,
             'data': row,
         })
         found = True
     if not found:
         log.error('No resource found by url',
                   extra={
                       'hashed_url': hashed_url,
                       'url': url
                   })
Example #3
    def remote_organizations(self):
        response = self.get('organization_list')
        for name in response['result']:
            details = self.get('organization_show', {'id': name})['result']
            organization = self.get_harvested(Organization, details['id'])
            organization.name = details['title']
            organization.slug = details['name']
            organization.description = details['description']
            organization.image_url = details['image_url'] or None

            if self.config.get('users') is not None:
                for member in details['users']:
                    user = self.get_harvested(User, member['id'], create=False)
                    if user and not get_by(organization.members, 'user', user):
                        role = 'admin' if member['capacity'] == 'admin' else 'editor'
                        organization.members.append(Member(role=role, user=user))

            yield organization

            if not organization.id:
                continue

            followers = self.get('group_follower_list', {'id': name})['result']
            for follower in followers:
                user = self.get_harvested(User, follower['id'], create=False)
                if user:
                    follow, created = FollowOrg.objects.get_or_create(follower=user, following=organization)
Example #4
def get_resource(id):
    '''Fetch a resource given its UUID'''
    dataset = Dataset.objects(resources__id=id).first()
    if dataset:
        return get_by(dataset.resources, 'id', id)
    else:
        return CommunityResource.objects(id=id).first()
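Example #4 captures the lookup pattern most of these snippets share: a MongoDB query on the embedded list (Dataset.objects(resources__id=id)) finds the parent dataset, then get_by picks the matching embedded resource out of dataset.resources, with standalone CommunityResource documents as a fallback. A minimal usage sketch, assuming the models above are importable; the UUID value is made up purely for illustration:

from uuid import UUID

# Hypothetical call; the UUID below is illustrative only.
resource = get_resource(UUID('5f2b1c9e-0000-0000-0000-000000000000'))
if resource is None:
    print('No dataset-embedded or community resource matches this id')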
Example #5
    def remote_organizations(self):
        response = self.get('organization_list')
        for name in response['result']:
            details = self.get('organization_show', {'id': name})['result']
            organization = self.get_harvested(Organization, details['id'])
            organization.name = details['title']
            organization.slug = details['name']
            organization.description = details['description']
            organization.image_url = details['image_url'] or None

            if self.config.get('users') is not None:
                for member in details['users']:
                    user = self.get_harvested(User, member['id'], create=False)
                    if user and not get_by(organization.members, 'user', user):
                        role = 'admin' if member[
                            'capacity'] == 'admin' else 'editor'
                        organization.members.append(
                            Member(role=role, user=user))

            yield organization

            if not organization.id:
                continue

            followers = self.get('group_follower_list', {'id': name})['result']
            for follower in followers:
                user = self.get_harvested(User, follower['id'], create=False)
                if user:
                    follow, created = FollowOrg.objects.get_or_create(
                        follower=user, following=organization)
Example #6
 def detect_by_hashed_url(self, hashed_url, row):
     found = False
     try:
         datasets = Dataset.objects.filter(resources__urlhash=hashed_url)
         for dataset in datasets:
             resource = get_by(dataset.resources, 'urlhash', hashed_url)
             self.resources.append({
                 'dataset': dataset,
                 'resource': resource,
                 'data': row,
             })
             found = True
     except Dataset.DoesNotExist:
         pass
     try:
         resources = CommunityResource.objects.filter(urlhash=hashed_url)
         for resource in resources:
             self.community_resources.append({
                 'resource': resource,
                 'data': row,
             })
             found = True
     except CommunityResource.DoesNotExist:
         pass
     if not found:
         log.error('No object found for urlhash %s' % hashed_url)
Example #7
    def test_coverage_for_level(self):
        register_level('country', 'included', 'Included level')
        included = [TerritoryFactory(level='included') for _ in range(2)]
        excluded = [TerritoryFactory(level='country') for _ in range(2)]
        [
            VisibleDatasetFactory(spatial=SpatialCoverageFactory(
                territories=[t.reference()])) for t in included
        ]
        [
            VisibleDatasetFactory(spatial=SpatialCoverageFactory(
                territories=[t.reference()])) for t in excluded
        ]

        response = self.get(url_for('api.spatial_coverage', level='included'))
        self.assert200(response)
        self.assertEqual(len(response.json['features']), 2)

        for feature in response.json['features']:
            self.assertEqual(feature['type'], 'Feature')

            territory = get_by(included, 'id', ObjectId(feature['id']))
            self.assertIsNotNone(territory)
            self.assertEqual(feature['geometry'], territory.geom)

            properties = feature['properties']
            self.assertEqual(properties['name'], territory.name)
            self.assertEqual(properties['code'], territory.code)
            self.assertEqual(properties['level'], 'included')
            self.assertEqual(properties['datasets'], 1)
Example #8
def publish(document, resource_id, action):
    if action == KafkaMessageType.DELETED:
        resource = None
    else:
        resource = serialize_resource_for_event(get_by(document.resources, 'id', resource_id))
    message_type = f'resource.{action.value}'
    produce(id=str(resource_id), message_type=message_type, document=resource, dataset_id=str(document.id))
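Example #8 derives the Kafka message type from the enum value (resource.<action>) and sends document=None for deletions, since the serialized resource no longer exists at that point. A hedged usage sketch; only KafkaMessageType.DELETED appears above, so the CREATED member is an assumption made for illustration:

# Hypothetical calls; dataset and resource are assumed to already exist,
# and a KafkaMessageType.CREATED member is assumed alongside DELETED.
publish(dataset, resource.id, KafkaMessageType.CREATED)  # message_type 'resource.created', serialized document attached
publish(dataset, resource.id, KafkaMessageType.DELETED)  # message_type 'resource.deleted', document is None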
Example #9
def on_resource_created(dataset, resource_id=None):
    log.debug('on_resource_created triggered in link_proxy')
    if not resource_id:
        log.error('No resource_id provided')
        return
    resource = get_by(dataset.resources, 'id', resource_id)
    log.debug('link_proxy sending check for %s' % resource.url)
    check.delay(dataset.id, resource.id, resource.url)
Example #10
 def delete(self, **kwargs):
     self.kwargs = kwargs
     org = self.get_object()
     user = User.objects.get_or_404(id=request.form.get('user_id'))
     member = get_by(org.members, 'user', user)
     org.members.remove(member)
     org.save()
     return '', 204
Example #11
 def delete(self, slug, rid):
     dataset = Dataset.objects.get_or_404(slug=slug)
     resource = get_by(dataset.resources, 'id', UUID(rid))
     if not resource:
         abort(404)
     dataset.resources.remove(resource)
     dataset.save()
     return '', 204
Example #12
 def put(self, dataset):
     '''Reorder resources'''
     new_resources = []
     for rid in request.json:
         resource = get_by(dataset.resources, 'id', UUID(rid))
         new_resources.append(resource)
     dataset.resources = new_resources
     dataset.save()
     return dataset.resources, 200
Example #13
def resource_redirect(id):
    '''
    Redirect to the latest version of a resource given its identifier.
    '''
    dataset = Dataset.objects(resources__id=id).first()
    if dataset:
        resource = get_by(dataset.resources, 'id', id)
    else:
        resource = CommunityResource.objects(id=id).first()
    return redirect(resource.url.strip()) if resource else abort(404)
Example #14
 def on_form_valid(self, form):
     user = User.objects.get_or_404(id=form.pk.data)
     member = get_by(self.organization.members, 'user', user)
     if member:
         member.role = form.value.data
     else:
         member = Member(user=user, role=form.value.data or 'editor')
         self.organization.members.append(member)
     self.organization.save()
     return '', 200
Example #15
 def on_form_valid(self, form):
     user = User.objects.get_or_404(id=form.pk.data)
     member = get_by(self.organization.members, 'user', user)
     if member:
         member.role = form.value.data
     else:
         member = Member(user=user, role=form.value.data or 'editor')
         self.organization.members.append(member)
     self.organization.save()
     notify_new_member.delay(self.organization, member)
     return '', 200
Example #16
 def put(self, slug, rid):
     dataset = Dataset.objects.get_or_404(slug=slug)
     resource = get_by(dataset.resources, 'id', UUID(rid))
     if not resource:
         abort(404)
     form = ResourceForm(request.form, instance=resource, csrf_enabled=False)
     if not form.validate():
         return {'errors': form.errors}, 400
     form.populate_obj(resource)
     dataset.save()
     return marshal(resource, resource_fields)
Example #17
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Map a DCAT/RDF distribution to a Resource domain model
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)
    # we shouldn't create resources without URLs
    if not url:
        log.warning(f'Resource without url: {distrib}')
        return

    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    fmt = rdf_value(distrib, DCT.format)
    if fmt:
        resource.format = fmt.lower()
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        algorithm = checksum.value(SPDX.algorithm).identifier
        algorithm = CHECKSUM_ALGORITHMS.get(algorithm)
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier

    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
Example #18
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Map a DCAT/RDF distribution to a Resource domain model
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)

    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        algorithm = checksum.value(SPDX.algorithm).identifier
        algorithm = CHECKSUM_ALGORITHMS.get(algorithm)
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier

    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
Example #19
    def get(self, rid):
        dataset = Dataset.objects(resources__id=rid).first()
        if dataset:
            resource = get_by(dataset.resources, 'id', rid)
        else:
            resource = CommunityResource.objects(id=rid).first()
        if not resource:
            apiv2.abort(404, 'Resource does not exist')

        # Marshal manually to make sure resource.dataset is in scope.
        # See discussions in https://github.com/opendatateam/udata/pull/2732/files
        return marshal(
            {
                'resource': resource,
                'dataset_id': dataset.id if dataset else None
            }, specific_resource_fields)
Example #20
 def detect_by_resource_id(self, resource_id, row):
     try:
         dataset = Dataset.objects.get(resources__id=resource_id)
         resource = get_by(dataset.resources, 'id', uuid.UUID(resource_id))
         self.resources.append({
             'dataset': dataset,
             'resource': resource,
             'data': row,
         })
     except Dataset.DoesNotExist:
         try:
             resource = CommunityResource.objects.get(id=resource_id)
             self.community_resources.append({
                 'resource': resource,
                 'data': row,
             })
         except CommunityResource.DoesNotExist:
             log.error('No object found for resource_id %s' % resource_id)
Example #21
def check(self, did, rid, url):
    log.debug('Sending check for url %s' % url)
    r = requests.post('%s/' % LINK_PROXY_URL, json={
        'location': url,
    })
    if not r.status_code == 200:
        log.error('link-proxy responded w/ status code %s' % r.status_code)
        return
    data = r.json()
    if '_id' not in data:
        log.error('link-proxy did not respond with an _id (%s)' % data)
        return
    check_id = data['_id']
    dataset = Dataset.objects.get(id=did)
    resource = get_by(dataset.resources, 'id', rid)
    resource.extras['link_proxy:check_id'] = check_id
    resource.save(signal_kwargs={'ignores': ['post_save']})
    log.debug('Check sent for url %s with id %s' % (url, check_id))
Example #22
def check_resources(self, number):
    '''Check <number> of URLs that have not been (recently) checked'''
    if not current_app.config.get('LINKCHECKING_ENABLED'):
        log.error('Link checking is disabled.')
        return

    base_pipeline = [
        {'$match': {'resources': {'$gt': []}}},
        {'$project': {'resources._id': True,
                      'resources.extras.check:date': True}},
        {'$unwind': '$resources'},
    ]
    # unchecked resources
    pipeline = base_pipeline + [
        {'$match': {'resources.extras.check:date': {'$eq': None}}},
        {'$limit': number}
    ]
    resources = list(Dataset.objects.aggregate(*pipeline))
    # not recently checked resources
    slots_left = number - len(resources)
    if slots_left:
        pipeline = base_pipeline + [
            {'$match': {'resources.extras.check:date': {'$ne': None}}},
            {'$sort': {'resources.extras.check:date': 1}},
            {'$limit': slots_left}
        ]
        resources += list(Dataset.objects.aggregate(*pipeline))

    nb_resources = len(resources)
    log.info('Checking %s resources...', nb_resources)
    for idx, dataset_resource in enumerate(resources):
        dataset_obj = Dataset.objects.get(id=dataset_resource['_id'])
        resource_id = dataset_resource['resources']['_id']
        rid = uuid.UUID(resource_id)
        resource_obj = get_by(dataset_obj.resources, 'id', rid)
        log.info('Checking resource %s (%s/%s)',
                 resource_id, idx + 1, nb_resources)
        if resource_obj.need_check():
            check_resource(resource_obj)
        else:
            log.info("--> Skipping this resource, cache is fresh enough.")
    log.info('Done.')
Example #23
    def test_coverage_for_level(self):
        GeoLevelFactory(id='top')
        GeoLevelFactory(id='sub', parents=['top'])
        GeoLevelFactory(id='child', parents=['sub'])

        topzones, subzones, childzones = [], [], []
        for _ in range(2):
            zone = GeoZoneFactory(level='top')
            topzones.append(zone)
            for _ in range(2):
                subzone = GeoZoneFactory(level='sub', parents=[zone.id])
                subzones.append(subzone)
                for _ in range(2):
                    childzone = GeoZoneFactory(
                        level='child', parents=[zone.id, subzone.id])
                    childzones.append(childzone)

        for zone in topzones + subzones + childzones:
            VisibleDatasetFactory(
                spatial=SpatialCoverageFactory(zones=[zone.id]))

        response = self.get(url_for('api.spatial_coverage', level='sub'))
        self.assert200(response)
        self.assertEqual(len(response.json['features']), len(subzones))

        for feature in response.json['features']:
            self.assertEqual(feature['type'], 'Feature')

            zone = get_by(subzones, 'id', feature['id'])
            self.assertIsNotNone(zone)
            self.assertJsonEqual(feature['geometry'], zone.geom)

            properties = feature['properties']
            self.assertEqual(properties['name'], zone.name)
            self.assertEqual(properties['code'], zone.code)
            self.assertEqual(properties['level'], 'sub')
            # Datasets from nested (child) levels should be counted too
            self.assertEqual(properties['datasets'], 3)
Example #24
    def handle_downloads(self, row, day):
        if 'url' in row:
            try:
                hashed_url = hash_url(row['url'])
                data = (
                    Dataset.objects(resources__urlhash=hashed_url).first()
                    or
                    CommunityResource.objects(urlhash=hashed_url).first()
                )
                if isinstance(data, Dataset):
                    dataset = data
                    resource = get_by(dataset.resources, 'urlhash', hashed_url)
                    log.debug('Found resource download: %s', resource.url)
                    self.count(resource, day, row)
                    metric = ResourceViews(resource)
                    metric.compute()
                    # Use the MongoDB positional operator ($)
                    cmd = 'set__resources__S__metrics__{0}'.format(metric.name)
                    qs = Dataset.objects(id=dataset.id,
                                         resources__id=resource.id)
                    qs.update(**{cmd: metric.value})
                    if dataset.organization:
                        OrgResourcesDownloads(dataset.organization).compute()
                elif isinstance(data, CommunityResource):
                    resource = data
                    log.debug('Found community resource download: %s',
                              resource.url)
                    self.count(resource, day, row)
                    metric = CommunityResourceViews(resource)
                    metric.compute()
                    resource.metrics[metric.name] = metric.value
                    resource.save()

            except Exception:
                log.exception('Unable to count download for %s', row['url'])
        if 'subtable' in row:
            for subrow in row['subtable']:
                self.handle_downloads(subrow, day)
Example #25
    def test_coverage_for_level(self):
        GeoLevelFactory(id='top')
        GeoLevelFactory(id='sub', parents=['top'])
        GeoLevelFactory(id='child', parents=['sub'])

        topzones, subzones, childzones = [], [], []
        for _ in range(2):
            zone = GeoZoneFactory(level='top')
            topzones.append(zone)
            for _ in range(2):
                subzone = GeoZoneFactory(level='sub', parents=[zone.id])
                subzones.append(subzone)
                for _ in range(2):
                    childzone = GeoZoneFactory(
                        level='child', parents=[zone.id, subzone.id])
                    childzones.append(childzone)

        for zone in topzones + subzones + childzones:
            VisibleDatasetFactory(
                spatial=SpatialCoverageFactory(zones=[zone.id]))

        response = self.get(url_for('api.spatial_coverage', level='sub'))
        self.assert200(response)
        self.assertEqual(len(response.json['features']), len(subzones))

        for feature in response.json['features']:
            self.assertEqual(feature['type'], 'Feature')

            zone = get_by(subzones, 'id', feature['id'])
            self.assertIsNotNone(zone)
            assert_json_equal(feature['geometry'], zone.geom)

            properties = feature['properties']
            self.assertEqual(properties['name'], zone.name)
            self.assertEqual(properties['code'], zone.code)
            self.assertEqual(properties['level'], 'sub')
            # Datasets from nested (child) levels should be counted too
            self.assertEqual(properties['datasets'], 3)
Example #26
    def populate_obj(self, obj, name):
        if not self.has_data:
            return

        initial_values = getattr(obj, name, [])
        new_values = []

        class Holder(object):
            pass

        holder = Holder()

        for idx, field in enumerate(self):
            initial = None
            if hasattr(self.nested_model, 'id') and 'id' in field.data:
                id = self.nested_model.id.to_python(field.data['id'])
                initial = get_by(initial_values, 'id', id)

            holder.nested = initial or self.nested_model()
            field.populate_obj(holder, 'nested')
            new_values.append(holder.nested)

        setattr(obj, name, new_values)
Example #27
    def _add_entry(self, formdata=None, data=unset_value, index=None):
        '''
        Fill the form with previous data if necessary to handle partial update
        '''
        if formdata:
            prefix = '-'.join((self.name, str(index)))
            basekey = '-'.join((prefix, '{0}'))
            idkey = basekey.format('id')
            if prefix in formdata:
                formdata[idkey] = formdata.pop(prefix)
            if hasattr(self.nested_model, 'id') and idkey in formdata:
                id = self.nested_model.id.to_python(formdata[idkey])
                data = get_by(self.initial_data, 'id', id)

                initial = flatten_json(self.nested_form, data.to_mongo(),
                                       prefix)

                for key, value in initial.items():
                    if key not in formdata:
                        formdata[key] = value
            else:
                data = None
        return super(NestedModelList, self)._add_entry(formdata, data, index)
Example #28
 def detect_by_resource_id(self, resource_id, row):
     try:
         # use filter().first() to avoid errors when the id matches several datasets
         dataset = Dataset.objects.filter(resources__id=resource_id).first()
         if not dataset:
             raise Dataset.DoesNotExist
         resource = get_by(dataset.resources, 'id', uuid.UUID(resource_id))
         self.resources.append({
             'dataset': dataset,
             'resource': resource,
             'data': row,
             'latest': True,
         })
     except Dataset.DoesNotExist:
         try:
             resource = CommunityResource.objects.get(id=resource_id)
             self.community_resources.append({
                 'resource': resource,
                 'data': row,
                 'latest': True,
             })
         except CommunityResource.DoesNotExist:
             log.error('No object found for resource_id %s' % resource_id)
Example #29
    def populate_obj(self, obj, name):
        if not self.has_data:
            return

        initial_values = getattr(obj, name, [])
        new_values = []

        class Holder(object):
            pass

        holder = Holder()

        for idx, field in enumerate(self):
            initial = None
            if hasattr(self.nested_model, 'id') and 'id' in field.data:
                id = self.nested_model.id.to_python(field.data['id'])
                initial = get_by(initial_values, 'id', id)

            holder.nested = initial or self.nested_model()
            field.populate_obj(holder, 'nested')
            new_values.append(holder.nested)

        setattr(obj, name, new_values)
Example #30
    def test_coverage_for_level(self):
        register_level('country', 'included', 'Included level')
        included = [TerritoryFactory(level='included') for _ in range(2)]
        excluded = [TerritoryFactory(level='country') for _ in range(2)]
        [VisibleDatasetFactory(spatial=SpatialCoverageFactory(territories=[t.reference()])) for t in included]
        [VisibleDatasetFactory(spatial=SpatialCoverageFactory(territories=[t.reference()])) for t in excluded]

        response = self.get(url_for('api.spatial_coverage', level='included'))
        self.assert200(response)
        self.assertEqual(len(response.json['features']), 2)

        for feature in response.json['features']:
            self.assertEqual(feature['type'], 'Feature')

            territory = get_by(included, 'id', ObjectId(feature['id']))
            self.assertIsNotNone(territory)
            self.assertEqual(feature['geometry'], territory.geom)

            properties = feature['properties']
            self.assertEqual(properties['name'], territory.name)
            self.assertEqual(properties['code'], territory.code)
            self.assertEqual(properties['level'], 'included')
            self.assertEqual(properties['datasets'], 1)
Example #31
    def _add_entry(self, formdata=None, data=unset_value, index=None):
        '''
        Fill the form with previous data if necessary to handle partial update
        '''
        if formdata:
            prefix = '-'.join((self.name, str(index)))
            basekey = '-'.join((prefix, '{0}'))
            idkey = basekey.format('id')
            if prefix in formdata:
                formdata[idkey] = formdata.pop(prefix)
            if hasattr(self.nested_model, 'id') and idkey in formdata:
                id = self.nested_model.id.to_python(formdata[idkey])
                data = get_by(self.initial_data, 'id', id)

                initial = flatten_json(self.nested_form,
                                       data.to_mongo(),
                                       prefix)

                for key, value in initial.items():
                    if key not in formdata:
                        formdata[key] = value
            else:
                data = None
        return super(NestedModelList, self)._add_entry(formdata, data, index)
Example #32
def inject_organization_needs(sender, identity):
    if current_user.is_authenticated():
        for org in Organization.objects(members__user=current_user.id):
            membership = get_by(org.members, 'user',
                                current_user._get_current_object())
            identity.provides.add(OrganizationNeed(membership.role, org.id))
Example #33
 def get_resource_or_404(self, dataset, id):
     resource = get_by(dataset.resources, 'id', id)
     if not resource:
         api.abort(404, 'Resource does not exist')
     return resource
Example #34
    def process(self, item):
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], schema)

        # Fix the remote_id: use the real ID instead of the unstable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = data['notes']
        dataset.license = License.objects(id=data['license_id']).first()
        # dataset.license = license or License.objects.get(id='notspecified')
        dataset.tags = [t['name'] for t in data['tags']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom = None

        for extra in data['extras']:
            # GeoJSON representation (Polygon or Point)
            if extra['key'] == 'spatial':
                spatial_geom = json.loads(extra['value'])
            # Textual representation of the extent / location
            elif extra['key'] == 'spatial-text':
                log.debug('spatial-text value not handled')
                print('spatial-text', extra['value'])
            # Linked Data URI representing the place name
            elif extra['key'] == 'spatial-uri':
                log.debug('spatial-uri value not handled')
                print('spatial-uri', extra['value'])
            # Update frequency
            elif extra['key'] == 'frequency':
                print('frequency', extra['value'])
            # Temporal coverage start
            elif extra['key'] == 'temporal_start':
                print('temporal_start', extra['value'])
                temporal_start = daterange_start(extra['value'])
                continue
            # Temporal coverage end
            elif extra['key'] == 'temporal_end':
                print('temporal_end', extra['value'])
                temporal_end = daterange_end(extra['value'])
                continue
            # else:
            #     print extra['key'], extra['value']
            dataset.extras[extra['key']] = extra['value']

        if spatial_geom:
            dataset.spatial = SpatialCoverage()
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        if data.get('url'):
            dataset.extras['remote_url'] = data['url']

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = res.get('description')
            resource.url = res['url']
            resource.filetype = ('api' if res['resource_type'] == 'api'
                                 else 'remote')
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        return dataset
Example #35
 def test_attribute_not_found(self):
     '''get_by() should not fail if an object doesn't have the given attr'''
     assert get_by(TEST_LIST, 'inexistant', 'value') is None
Example #36
 def test_attribute_not_found(self):
     '''get_by() should not fail if an object doesn't have the given attr'''
     self.assertIsNone(get_by(TEST_LIST, 'inexistant', 'value'))
Example #37
 def test_not_found(self):
     '''get_by() should return None if not found'''
     self.assertIsNone(get_by(TEST_LIST, 'name', 'not-found'))
Example #38
 def test_find_object(self):
     '''get_by() should find an object in a list'''
     obj_lst = [ObjDict(d) for d in TEST_LIST]
     result = get_by(obj_lst, 'name', 'bbb')
     self.assertEqual(result.name, 'bbb')
     self.assertEqual(result.another, 'ddd')
Example #39
 def test_find_dict(self):
     '''get_by() should find a dictionary in a list'''
     result = get_by(TEST_LIST, 'name', 'bbb')
     self.assertEqual(result, {'name': 'bbb', 'another': 'ddd'})
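The unit tests above (Examples #35 through #39) and their assert-style variants later in this list pin down the contract of get_by: it searches a list, matches either a dict key or an object attribute, and returns None when nothing matches or the attribute is missing entirely. A minimal sketch consistent with those tests, not necessarily the exact udata implementation:

def get_by(lst, field, value):
    '''Return the first item in lst whose field (dict key or attribute) equals value, else None.'''
    for item in lst:
        if isinstance(item, dict):
            candidate = item.get(field)
        else:
            candidate = getattr(item, field, None)
        if candidate == value:
            return item
    return None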
Example #40
def inject_organization_needs(sender, identity):
    if current_user.is_authenticated():
        for org in Organization.objects(members__user=current_user.id):
            membership = get_by(org.members, 'user', current_user._get_current_object())
            identity.provides.add(OrganizationNeed(membership.role, org.id))
Example #41
    def process(self, item):
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], schema)

        # Fix the remote_id: use the real ID instead of the unstable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = data['notes']
        dataset.license = License.objects(id=data['license_id']).first()
        # dataset.license = license or License.objects.get(id='notspecified')
        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom = None

        for extra in data['extras']:
            # GeoJSON representation (Polygon or Point)
            if extra['key'] == 'spatial':
                spatial_geom = json.loads(extra['value'])
            # Textual representation of the extent / location
            elif extra['key'] == 'spatial-text':
                log.debug('spatial-text value not handled')
                print('spatial-text', extra['value'])
            # Linked Data URI representing the place name
            elif extra['key'] == 'spatial-uri':
                log.debug('spatial-uri value not handled')
                print('spatial-uri', extra['value'])
            # Update frequency
            elif extra['key'] == 'frequency':
                print('frequency', extra['value'])
            # Temporal coverage start
            elif extra['key'] == 'temporal_start':
                print('temporal_start', extra['value'])
                temporal_start = daterange_start(extra['value'])
                continue
            # Temporal coverage end
            elif extra['key'] == 'temporal_end':
                print('temporal_end', extra['value'])
                temporal_end = daterange_end(extra['value'])
                continue
            # else:
            #     print extra['key'], extra['value']
            dataset.extras[extra['key']] = extra['value']

        if spatial_geom:
            dataset.spatial = SpatialCoverage()
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        if data.get('url'):
            dataset.extras['remote_url'] = data['url']

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = res.get('description')
            resource.url = res['url']
            resource.filetype = ('api' if res['resource_type'] == 'api'
                                 else 'remote')
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        return dataset
Example #42
    def remote_datasets(self):
        response = self.get('package_list')
        for name in response['result']:
            details = self.get('package_show', {'id': name})['result']
            dataset = self.get_harvested(Dataset, details['id'])

            # Core attributes
            dataset.slug = details['name']
            dataset.title = details['title']
            dataset.description = details.get('notes', 'No description')
            dataset.license = License.objects(id=details['license_id']).first() or License.objects.get(id='notspecified')
            dataset.tags = [tag['name'].lower() for tag in details['tags']]

            dataset.frequency = self.map('frequency', details) or 'unknown'
            dataset.created_at = parse(details['metadata_created'])
            dataset.last_modified = parse(details['metadata_modified'])

            if any_field(details, 'territorial_coverage', 'territorial_coverage_granularity'):
                coverage = TerritorialCoverage(
                    codes=[code.strip() for code in details.get('territorial_coverage', '').split(',') if code.strip()],
                    granularity=self.map('territorial_coverage_granularity', details),
                )
                dataset.extras['territorial_coverage'] = coverage
                try:
                    dataset.spatial = territorial_to_spatial(dataset)
                except Exception as e:
                    print('Error while processing spatial coverage for {0}:'.format(dataset.title), e)

            if all_field(details, 'temporal_coverage_from', 'temporal_coverage_to'):
                try:
                    dataset.temporal_coverage = db.DateRange(
                        start=daterange_start(details.get('temporal_coverage_from')),
                        end=daterange_end(details.get('temporal_coverage_to')),
                    )
                except Exception:
                    log.error('Unable to parse temporal coverage for dataset %s', details['id'])

            # Organization
            if details.get('organization'):
                dataset.organization = self.get_harvested(Organization, details['organization']['id'], False)
            else:
                # Need to fetch user from roles
                roles = self.get('roles_show', {'domain_object': name})['result']['roles']
                for role in roles:
                    if role['role'] == 'admin' and role['context'] == 'Package':
                        dataset.owner = self.get_harvested(User, role['user_id'])
                        break

            # Supplier
            if details.get('supplier_id'):
                dataset.supplier = self.get_harvested(Organization, details['supplier_id'], False)

            # Remote URL
            if details.get('url'):
                dataset.extras['remote_url'] = details['url']

            # Extras
            if 'extras' in details:
                extra_mapping = self.harvester.mapping.get('from_extras', {})
                for extra in details['extras']:
                    if extra['key'] in self.harvester.mapping:
                        value = self.harvester.mapping[extra['key']].get(extra['value'])
                    else:
                        value = extra['value']
                    if extra['key'] in extra_mapping:
                        setattr(dataset, extra_mapping[extra['key']], value)
                    else:
                        dataset.extras[extra['key']] = value

            # Resources
            for res in details['resources']:
                try:
                    resource = get_by(dataset.resources, 'id', UUID(res['id']))
                except Exception:
                    log.error('Unable to parse resource %s', res['id'])
                    continue
                if not resource:
                    resource = Resource(id=res['id'])
                    dataset.resources.append(resource)
                resource.title = res.get('name', '') or ''
                resource.url = res['url']
                resource.description = res.get('description')
                resource.format = res.get('format')
                resource.hash = res.get('hash')
                resource.created = parse(res['created'])
                resource.modified = parse(res['revision_timestamp'])
                resource.published = resource.published or resource.created
            yield dataset

            if dataset.id:
                followers = self.get('dataset_follower_list', {'id': name})['result']
                for follower in followers:
                    user = self.get_harvested(User, follower['id'], False)
                    if user:
                        follow, created = FollowDataset.objects.get_or_create(follower=user, following=dataset)
Example #43
 def get_resource_or_404(self, dataset, id):
     resource = get_by(dataset.resources, 'id', id)
     if not resource:
         api.abort(404, 'Resource does not exist')
     return resource
Example #44
 def test_find_dict(self):
     '''get_by() should find a dictionary in a list'''
     result = get_by(TEST_LIST, 'name', 'bbb')
     assert result == {'name': 'bbb', 'another': 'ddd'}
Example #45
    def process(self, item):
        response = self.get(item.remote_id)
        encoding = chardet.detect(response.content)['encoding']
        xml = self.parse_xml(response.content.decode(encoding))
        metadata = xml['metadata']

        # Resolve the remote id from the metadata
        item.remote_id = metadata['id']
        dataset = self.get_dataset(metadata['id'])

        dataset.title = metadata['title']
        dataset.frequency = FREQUENCIES.get(metadata['frequency'], 'unknown')
        dataset.description = metadata['notes']
        dataset.private = metadata['private']
        dataset.tags = sorted(set(metadata['tags']))

        if metadata.get('license_id'):
            dataset.license = License.objects.get(id=metadata['license_id'])

        if (metadata.get('temporal_coverage_from')
                and metadata.get('temporal_coverage_to')):
            dataset.temporal_coverage = db.DateRange(
                start=metadata['temporal_coverage_from'],
                end=metadata['temporal_coverage_to'])

        if (metadata.get('territorial_coverage_code')
                or metadata.get('territorial_coverage_granularity')):
            dataset.spatial = SpatialCoverage()

            if metadata.get('territorial_coverage_granularity'):
                dataset.spatial.granularity = GRANULARITIES.get(
                    metadata['territorial_coverage_granularity'])

            if metadata.get('territorial_coverage_code'):
                dataset.spatial.zones = [
                    ZONES[metadata['territorial_coverage_code']]
                ]

        dataset.resources = []
        cle = get_by(metadata['resources'], 'format', 'cle')
        for row in metadata['resources']:
            if row['format'] == 'cle':
                continue
            else:
                resource = Resource(title=row['name'],
                                    description=(row['description'] + '\n\n' +
                                                 SSL_COMMENT).strip(),
                                    filetype='remote',
                                    url=row['url'],
                                    format=row['format'])
                if resource.format == 'csv' and cle:
                    resource.checksum = Checksum(type='sha256',
                                                 value=self.get(
                                                     cle['url']).text)
                if row.get('last_modified'):
                    resource.modified = row['last_modified']
                dataset.resources.append(resource)

        if metadata.get('author'):
            dataset.extras['author'] = metadata['author']
        if metadata.get('author_email'):
            dataset.extras['author_email'] = metadata['author_email']
        if metadata.get('maintainer'):
            dataset.extras['maintainer'] = metadata['maintainer']
        if metadata.get('maintainer_email'):
            dataset.extras['maintainer_email'] = metadata['maintainer_email']
        for extra in metadata['extras']:
            dataset.extras[extra['key']] = extra['value']

        return dataset
Example #46
 def get_resource(self, dataset, url):
     resource = get_by(dataset.resources, 'url', url)
     if not resource:
         return True, Resource(url=url)
     return False, resource
Example #47
 def test_find_object(self):
     '''get_by() should find an object in a list'''
     obj_lst = [ObjDict(d) for d in TEST_LIST]
     result = get_by(obj_lst, 'name', 'bbb')
     assert result.name == 'bbb'
     assert result.another == 'ddd'
Example #48
 def test_not_found(self):
     '''get_by() should return None if not found'''
     assert get_by(TEST_LIST, 'name', 'not-found') is None
Example #49
    def process(self, item):
        response = self.get(item.remote_id)
        encoding = chardet.detect(response.content)['encoding']
        xml = self.parse_xml(response.content.decode(encoding))
        metadata = xml['metadata']

        # Resolve the remote id from the metadata
        item.remote_id = metadata['id']
        dataset = self.get_dataset(metadata['id'])

        dataset.title = metadata['title']
        dataset.frequency = FREQUENCIES.get(metadata['frequency'], 'unknown')
        dataset.description = metadata['notes']
        dataset.private = metadata['private']
        dataset.tags = sorted(set(metadata['tags']))

        if metadata.get('license_id'):
            dataset.license = License.objects.get(id=metadata['license_id'])

        if (metadata.get('temporal_coverage_from') and
                metadata.get('temporal_coverage_to')):
            dataset.temporal_coverage = db.DateRange(
                start=metadata['temporal_coverage_from'],
                end=metadata['temporal_coverage_to']
            )

        if (metadata.get('territorial_coverage_code') or
                metadata.get('territorial_coverage_granularity')):
            dataset.spatial = SpatialCoverage()

            if metadata.get('territorial_coverage_granularity'):
                dataset.spatial.granularity = GRANULARITIES.get(
                    metadata['territorial_coverage_granularity'])

            if metadata.get('territorial_coverage_code'):
                dataset.spatial.zones = [
                    ZONES[metadata['territorial_coverage_code']]]

        dataset.resources = []
        cle = get_by(metadata['resources'], 'format', 'cle')
        for row in metadata['resources']:
            if row['format'] == 'cle':
                continue
            else:
                resource = Resource(
                    title=row['name'],
                    description=(
                        row['description'] + '\n\n' + SSL_COMMENT).strip(),
                    filetype='remote',
                    url=row['url'],
                    format=row['format']
                )
                if resource.format == 'csv' and cle:
                    resource.checksum = Checksum(
                        type='sha256', value=self.get(cle['url']).text)
                if row.get('last_modified'):
                    resource.modified = row['last_modified']
                dataset.resources.append(resource)

        if metadata.get('author'):
            dataset.extras['author'] = metadata['author']
        if metadata.get('author_email'):
            dataset.extras['author_email'] = metadata['author_email']
        if metadata.get('maintainer'):
            dataset.extras['maintainer'] = metadata['maintainer']
        if metadata.get('maintainer_email'):
            dataset.extras['maintainer_email'] = metadata['maintainer_email']
        for extra in metadata['extras']:
            dataset.extras[extra['key']] = extra['value']

        return dataset