Example #1
0
def preview_from_config(
    name,
    url,
    backend,
    description=None,
    frequency=DEFAULT_HARVEST_FREQUENCY,
    owner=None,
    organization=None,
    config=None,
):
    '''Preview an harvesting from a source created with the given parameters'''
    if owner and not isinstance(owner, User):
        owner = User.get(owner)

    if organization and not isinstance(organization, Organization):
        organization = Organization.get(organization)

    source = HarvestSource(
        name=name,
        url=url,
        backend=backend,
        description=description,
        frequency=frequency or DEFAULT_HARVEST_FREQUENCY,
        owner=owner,
        organization=organization,
        config=config,
    )
    cls = backends.get(current_app, source.backend)
    max_items = current_app.config['HARVEST_PREVIEW_MAX_ITEMS']
    backend = cls(source, dryrun=True, max_items=max_items)
    return backend.harvest()
Example #2
0
def create_source(name, url, backend,
                  description=None,
                  frequency=DEFAULT_HARVEST_FREQUENCY,
                  owner=None,
                  organization=None,
                  config=None,
                  ):
    '''Create a new harvest source'''
    if owner and not isinstance(owner, User):
        owner = User.get(owner)

    if organization and not isinstance(organization, Organization):
        organization = Organization.get(organization)

    source = HarvestSource.objects.create(
        name=name,
        url=url,
        backend=backend,
        description=description,
        frequency=frequency or DEFAULT_HARVEST_FREQUENCY,
        owner=owner,
        organization=organization,
        config=config,
    )
    signals.harvest_source_created.send(source)
    return source
Example #3
0
def create_source(
    name,
    url,
    backend,
    description=None,
    frequency=DEFAULT_HARVEST_FREQUENCY,
    owner=None,
    organization=None,
    config=None,
    active=None,
    autoarchive=None,
):
    '''Create a new harvest source'''
    if owner and not isinstance(owner, User):
        owner = User.get(owner)

    if organization and not isinstance(organization, Organization):
        organization = Organization.get(organization)

    source = HarvestSource.objects.create(
        name=name,
        url=url,
        backend=backend,
        description=description,
        frequency=frequency or DEFAULT_HARVEST_FREQUENCY,
        owner=owner,
        organization=organization,
        config=config,
        active=active,
        autoarchive=autoarchive,
    )
    signals.harvest_source_created.send(source)
    return source
Example #4
0
 def get_value(self):
     org = (Organization.objects(metrics__reuses__gt=0).visible()
            .order_by('-metrics.reuses').first())
     if org:
         return org.metrics.get('reuses', 0)
     else:
         return 0
Example #5
0
def preview_from_config(name, url, backend,
                        description=None,
                        frequency=DEFAULT_HARVEST_FREQUENCY,
                        owner=None,
                        organization=None,
                        config=None,
                        ):
    '''Preview an harvesting from a source created with the given parameters'''
    if owner and not isinstance(owner, User):
        owner = User.get(owner)

    if organization and not isinstance(organization, Organization):
        organization = Organization.get(organization)

    source = HarvestSource(
        name=name,
        url=url,
        backend=backend,
        description=description,
        frequency=frequency or DEFAULT_HARVEST_FREQUENCY,
        owner=owner,
        organization=organization,
        config=config,
    )
    cls = backends.get(current_app, source.backend)
    max_items = current_app.config['HARVEST_PREVIEW_MAX_ITEMS']
    backend = cls(source, dryrun=True, max_items=max_items)
    return backend.harvest()
Example #6
0
    def test_purge_organizations(self):
        with self.autoindex():
            org = Organization.objects.create(name='delete me',
                                              description='XXX')
            resources = [ResourceFactory() for _ in range(2)]
            dataset = DatasetFactory(resources=resources, organization=org)

        # Upload organization's logo
        file = create_test_image()
        user = AdminFactory()
        self.login(user)
        response = self.post(url_for('api.organization_logo', org=org),
                             {'file': (file, 'test.png')},
                             json=False)
        self.assert200(response)

        # Delete organization
        response = self.delete(url_for('api.organization', org=org))
        self.assert204(response)

        tasks.purge_organizations()

        # Check organization's logo is deleted
        self.assertEqual(list(storages.avatars.list_files()), [])

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)

        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)
Example #7
0
    def mongo_search(cls, args):
        orgs = Organization.objects(deleted=None)
        orgs = OrgApiParser.parse_filters(orgs, args)

        sort = cls.parse_sort(args['sort']) or ('$text_score' if args['q'] else
                                                None) or DEFAULT_SORTING
        offset = (args['page'] - 1) * args['page_size']
        return orgs.order_by(sort).skip(offset).limit(
            args['page_size']), orgs.count()
Example #8
0
def climate_change_challenge():
    partners = Organization.objects(slug__in=C3_PARTNERS)
    datasets = (Dataset.objects(
        badges__kind=C3).visible().order_by('-metrics.followers'))
    return theme.render('c3.html',
                        partners=partners,
                        datasets=datasets,
                        badge=C3,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #9
0
def climate_change_challenge():
    partners = Organization.objects(slug__in=C3_PARTNERS)
    datasets = (Dataset.objects(badges__kind=C3).visible()
                .order_by('-metrics.followers'))
    return theme.render('c3.html',
                        partners=partners,
                        datasets=datasets,
                        badge=C3,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #10
0
    def test_purge_organizations(self):
        org = Organization.objects.create(name='delete me', description='XXX')
        resources = [ResourceFactory() for _ in range(2)]
        dataset = DatasetFactory(resources=resources, organization=org)

        # Upload organization's logo
        file = create_test_image()
        user = AdminFactory()
        self.login(user)
        response = self.post(
            url_for('api.organization_logo', org=org),
            {'file': (file, 'test.png')},
            json=False)
        self.assert200(response)

        transfer_to_org = Transfer.objects.create(
            owner=user,
            recipient=org,
            subject=dataset,
            comment='comment',
        )
        transfer_from_org = Transfer.objects.create(
            owner=org,
            recipient=user,
            subject=dataset,
            comment='comment',
        )

        oauth_client = OAuth2Client.objects.create(
            name='test-client',
            owner=user,
            organization=org,
            redirect_uris=['https://test.org/callback'],
        )

        # Delete organization
        response = self.delete(url_for('api.organization', org=org))
        self.assert204(response)

        tasks.purge_organizations()

        oauth_client.reload()
        assert oauth_client.organization is None

        assert Transfer.objects.filter(id=transfer_from_org.id).count() == 0
        assert Transfer.objects.filter(id=transfer_to_org.id).count() == 0

        # Check organization's logo is deleted
        self.assertEqual(list(storages.avatars.list_files()), [])

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)
Example #11
0
def purge_organizations():
    for organization in Organization.objects(deleted__ne=None):
        log.info('Purging organization "{0}"'.format(organization))
        # Remove followers
        FollowOrg.objects(following=organization).delete()
        # Remove activity
        Activity.objects(related_to=organization).delete()
        Activity.objects(organization=organization).delete()
        # Remove metrics
        Metrics.objects(object_id=organization.id).delete()
        organization.delete()
Example #12
0
def purge_organizations(self):
    for organization in Organization.objects(deleted__ne=None):
        log.info('Purging organization "{0}"'.format(organization))
        # Remove followers
        FollowOrg.objects(following=organization).delete()
        # Remove activity
        Activity.objects(related_to=organization).delete()
        Activity.objects(organization=organization).delete()
        # Remove metrics
        Metrics.objects(object_id=organization.id).delete()
        # Remove
        organization.delete()
Example #13
0
def purge_organizations(self):
    for organization in Organization.objects(deleted__ne=None):
        log.info('Purging organization "{0}"'.format(organization))
        # Remove followers
        Follow.objects(following=organization).delete()
        # Remove activity
        Activity.objects(related_to=organization).delete()
        Activity.objects(organization=organization).delete()
        # Remove metrics
        Metrics.objects(object_id=organization.id).delete()
        # Store datasets for later reindexation
        d_ids = [d.id for d in Dataset.objects(organization=organization)]
        # Remove
        organization.delete()
        # Reindex the datasets that were linked to the organization
        for dataset in Dataset.objects(id__in=d_ids):
            reindex(dataset)
Example #14
0
def create_source(name, url, backend, frequency=DEFAULT_HARVEST_FREQUENCY, owner=None, org=None):
    '''Create a new harvest source'''
    if owner and not isinstance(owner, User):
        owner = User.get(owner)

    if org and not isinstance(org, Organization):
        org = Organization.get(org)

    source = HarvestSource.objects.create(
        name=name,
        url=url,
        backend=backend,
        frequency=frequency or DEFAULT_HARVEST_FREQUENCY,
        owner=owner,
        organization=org
    )
    signals.harvest_source_created.send(source)
    return source
Example #15
0
    def serialize(cls, reuse):
        """By default use the ``to_dict`` method

        and exclude ``_id``, ``_cls`` and ``owner`` fields.
        """
        datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
        datasets = list(datasets.only('id', 'title').no_dereference())
        organization = None
        owner = None
        if reuse.organization:
            organization = Organization.objects(
                id=reuse.organization.id).first()
        elif reuse.owner:
            owner = User.objects(id=reuse.owner.id).first()
        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'type': reuse.type,
            'topic': reuse.topic,
            'tags': reuse.tags,
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': to_iso_datetime(reuse.created_at),
            'last_modified': to_iso_datetime(reuse.last_modified),
            'dataset': [{
                'id': str(d.id),
                'title': d.title
            } for d in datasets],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(500, external=True),
                },
            },
        }
Example #16
0
    def test_purge_organizations(self):
        with self.autoindex():
            org = Organization.objects.create(
                name='delete me', deleted='2016-01-01', description='XXX')
            resources = [ResourceFactory() for _ in range(2)]
            dataset = DatasetFactory(resources=resources, organization=org)

        tasks.purge_organizations()

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)

        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)
Example #17
0
    def serialize(cls, reuse):
        """By default use the ``to_dict`` method

        and exclude ``_id``, ``_cls`` and ``owner`` fields.
        """
        datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
        datasets = list(datasets.only('id', 'title').no_dereference())
        organization = None
        owner = None
        if reuse.organization:
            organization = Organization.objects(id=reuse.organization.id).first()
        elif reuse.owner:
            owner = User.objects(id=reuse.owner.id).first()
        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'type': reuse.type,
            'tags': reuse.tags,
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': reuse.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': reuse.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
            'dataset': [{
                'id': str(d.id),
                'title': d.title
            } for d in datasets],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(40, external=True),
                },
            },
        }
Example #18
0
    def test_purge_organizations(self):
        with self.autoindex():
            org = Organization.objects.create(name='delete me',
                                              deleted='2016-01-01',
                                              description='XXX')
            resources = [ResourceFactory() for _ in range(2)]
            dataset = DatasetFactory(resources=resources, organization=org)

        tasks.purge_organizations()

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertEqual(dataset.organization, None)

        organization = Organization.objects(name='delete me').first()
        self.assertEqual(organization, None)

        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertEqual(indexed_dataset.organization, '')
Example #19
0
 def get_value(self):
     org = (Organization.objects(metrics__datasets__gt=0).visible().
            order_by('-metrics.datasets').first())
     return org.metrics['datasets'] if org else 0
Example #20
0
 def get_value(self):
     org = (Organization.objects(metrics__followers__gt=0).visible().
            order_by('-metrics.followers').first())
     return org.metrics['followers'] if org else 0
Example #21
0
 def get_context(self):
     context = super(UserView, self).get_context()
     context['organizations'] = Organization.objects(
         members__user=self.user)
     return context
Example #22
0
 def get_context(self):
     context = super(UserView, self).get_context()
     context['organizations'] = Organization.objects(
         members__user=self.user)
     return context
Example #23
0
    def serialize(cls, dataset):
        organization = None
        owner = None
        image_url = None
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT

        if dataset.organization:
            organization = Organization.objects(id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        certified = organization and organization.certified

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': getattr(dataset.license, 'id', None),
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [
                {
                    'title': r.title,
                    'description': r.description,
                    'format': r.format,
                    'type': r.type,
                }
                for r in dataset.resources],
            'format_suggest': [r.format.lower()
                               for r in dataset.resources
                               if r.format],
            'frequency': dataset.frequency,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [str(dataset.id)],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': dataset.last_modified.strftime(
                '%Y-%m-%dT%H:%M:%S'),
            'metrics': dataset.metrics,
            'featured': dataset.featured,
            'from_certified': certified,
        }
        if (dataset.temporal_coverage is not None and
                dataset.temporal_coverage.start and
                dataset.temporal_coverage.end):
            start = dataset.temporal_coverage.start.toordinal()
            end = dataset.temporal_coverage.end.toordinal()
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document.update({
                'temporal_coverage': {'start': start, 'end': end},
                'temporal_weight': temporal_weight,
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values
                })
                parents |= set(zone.parents)
                coverage_level = min(coverage_level, admin_levels[zone.level])

            geozones.extend([{'id': p} for p in parents])

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
                'spatial_weight': spatial_weight,
            })

        document['dataset_suggest']['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            document['dataset_suggest']['input'].append(dataset.acronym)

        return document
Example #24
0
 def get_value(self):
     return Organization.objects(badges__kind=PUBLIC_SERVICE).count()
Example #25
0
 def get_value(self):
     org = Organization.objects(metrics__datasets__gt=0).visible().order_by('-metrics.datasets').first()
     return org.metrics.get('datasets', 0)
Example #26
0
 def get_context(self):
     context = super(UserView, self).get_context()
     context["organizations"] = Organization.objects(members__user=self.user)
     for item in navbar.items:
         item._args = {"user": self.user}
     return context
Example #27
0
 def get_value(self):
     return Organization.objects(badges__kind=PUBLIC_SERVICE).count()
Example #28
0
 def get_value(self):
     org = (Organization.objects(metrics__followers__gt=0).visible()
            .order_by('-metrics.followers').first())
     return org.metrics['followers'] if org else 0
Example #29
0
def set_g_user_orgs():
    if current_user.is_authenticated():
        g.user_organizations = Organization.objects(members__user=current_user.id)
Example #30
0
    def process(self, item):
        ods_dataset = item.kwargs['dataset']
        dataset_id = ods_dataset['datasetid']
        ods_metadata = ods_dataset['metas']
        ods_interopmetas = ods_dataset.get('interop_metas', {})

        if not ods_dataset.get('has_records'):
            msg = 'Dataset {datasetid} has no record'.format(**ods_dataset)
            raise HarvestSkipException(msg)

        if 'inspire' in ods_interopmetas and not self.has_feature('inspire'):
            msg = 'Dataset {datasetid} has INSPIRE metadata'
            raise HarvestSkipException(msg.format(**ods_dataset))

        dataset = self.get_dataset(item.remote_id)

        dataset.title = ods_metadata['title']
        dataset.frequency = 'unknown'
        description = ods_metadata.get('description', '').strip()
        dataset.description = parse_html(description)
        dataset.private = False

        # Detect Organization
        try:
            organization_acronym = ods_metadata['publisher']
        except KeyError:
            pass
        else:
            orgObj = Organization.objects(acronym=organization_acronym).first()
            if orgObj:
                dataset.organization = orgObj
            else:
                orgObj = Organization()
                orgObj.acronym = organization_acronym
                orgObj.name = organization_acronym
                orgObj.description = organization_acronym
                orgObj.save()

                dataset.organization = orgObj

        tags = set()
        if 'keyword' in ods_metadata:
            if isinstance(ods_metadata['keyword'], list):
                tags |= set(ods_metadata['keyword'])
            else:
                tags.add(ods_metadata['keyword'])

        if 'theme' in ods_metadata:
            if isinstance(ods_metadata['theme'], list):
                for theme in ods_metadata['theme']:
                    tags.update([t.strip().lower() for t in theme.split(',')])
            else:
                themes = ods_metadata['theme'].split(',')
                tags.update([t.strip().lower() for t in themes])

        dataset.tags = list(tags)
        dataset.tags.append(urlparse(self.source.url).hostname)

        # Detect license
        default_license = dataset.license or License.default()
        license_id = ods_metadata.get('license')
        dataset.license = License.guess(license_id,
                                        self.LICENSES.get(license_id),
                                        default=default_license)

        self.process_resources(dataset, ods_dataset, ('csv', 'json'))

        if 'geo' in ods_dataset['features']:
            exports = ['geojson']
            if ods_metadata['records_count'] <= self.SHAPEFILE_RECORDS_LIMIT:
                exports.append('shp')
            self.process_resources(dataset, ods_dataset, exports)

        self.process_extra_files(dataset, ods_dataset, 'alternative_export')
        self.process_extra_files(dataset, ods_dataset, 'attachment')

        dataset.extras['ods:url'] = self.explore_url(dataset_id)
        dataset.extras['harvest:name'] = self.source.name
        
        if 'references' in ods_metadata:
            dataset.extras['ods:references'] = ods_metadata['references']
        dataset.extras['ods:has_records'] = ods_dataset['has_records']
        dataset.extras['ods:geo'] = 'geo' in ods_dataset['features']

        return dataset
Example #31
0
 def get_value(self):
     org = Organization.objects(metrics__followers__gt=0).visible(
     ).order_by('-metrics.followers').first()
     return org.metrics.get('followers', 0)
Example #32
0
 def get_value(self):
     org = Organization.objects(metrics__followers__gt=0).visible().order_by('-metrics.followers').first()
     return org.metrics.get('followers', 0)
Example #33
0
def inject_organization_needs(sender, identity):
    if current_user.is_authenticated():
        for org in Organization.objects(members__user=current_user.id):
            membership = get_by(org.members, 'user',
                                current_user._get_current_object())
            identity.provides.add(OrganizationNeed(membership.role, org.id))
Example #34
0
 def get_value(self):
     org = Organization.objects(metrics__datasets__gt=0).visible().order_by(
         '-metrics.datasets').first()
     return org.metrics.get('datasets', 0)
Example #35
0
    def process(self, item):
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        if type(data) == list:
            data = data[0]

        # Fix the remote_id: use real ID instead of not stable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect Org
        organization_acronym = data['organization']['name']
        orgObj = Organization.objects(acronym=organization_acronym).first()
        if orgObj:
            #print 'Found %s' % orgObj.acronym
            dataset.organization = orgObj
        else:
            orgObj = Organization()
            orgObj.acronym = organization_acronym
            orgObj.name = data['organization']['title']
            orgObj.description = data['organization']['description']
            orgObj.save()
            #print 'Created %s' % orgObj.acronym

            dataset.organization = orgObj

        # Detect license
        default_license = self.harvest_config.get('license', License.default())
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.tags.append(urlparse(self.source.url).hostname)

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.frequency = 'unknown'
        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom = None

        for extra in data['extras']:
            # GeoJSON representation (Polygon or Point)
            if extra['key'] == 'spatial':
                spatial_geom = json.loads(extra['value'])
            #  Textual representation of the extent / location
            elif extra['key'] == 'spatial-text':
                log.debug('spatial-text value not handled')
            # Linked Data URI representing the place name
            elif extra['key'] == 'spatial-uri':
                log.debug('spatial-uri value not handled')
            # Update frequency
            elif extra['key'] == 'frequency':
                print 'frequency', extra['value']
            # Temporal coverage start
            elif extra['key'] == 'temporal_start':
                temporal_start = daterange_start(extra['value'])
                continue
            # Temporal coverage end
            elif extra['key'] == 'temporal_end':
                temporal_end = daterange_end(extra['value'])
                continue
            dataset.extras[extra['key']] = extra['value']

        # We don't want spatial to be added on harvester
        if self.harvest_config.get('geozones', False):
            dataset.spatial = SpatialCoverage()
            dataset.spatial.zones = []
            for zone in self.harvest_config.get('geozones'):
                geo_zone = GeoZone.objects.get(id=zone)
                dataset.spatial.zones.append(geo_zone)
        #
        # if spatial_geom:
        #     dataset.spatial = SpatialCoverage()
        #     if spatial_geom['type'] == 'Polygon':
        #         coordinates = [spatial_geom['coordinates']]
        #     elif spatial_geom['type'] == 'MultiPolygon':
        #         coordinates = spatial_geom['coordinates']
        #     else:
        #         HarvestException('Unsupported spatial geometry')
        #     dataset.spatial.geom = {
        #         'type': 'MultiPolygon',
        #         'coordinates': coordinates
        #     }

        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                dataset.extras['remote_url'] = self.dataset_url(data['name'])
                dataset.extras['ckan:source'] = data['url']
            else:
                dataset.extras['remote_url'] = url

        dataset.extras['harvest:name'] = self.source.name

        current_resources = [
            str(resource.id) for resource in dataset.resources
        ]
        fetched_resources = []
        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue

            #Ignore invalid Resources
            try:
                url = uris.validate(res['url'])
            except uris.ValidationError:
                continue

            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue

            fetched_resources.append(str(res['id']))
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        # Clean up old resources removed from source
        for resource_id in current_resources:
            if resource_id not in fetched_resources:
                try:
                    resource = get_by(dataset.resources, 'id',
                                      UUID(resource_id))
                except Exception:
                    log.error('Unable to parse resource ID %s', resource_id)
                    continue
                else:
                    if resource and not self.dryrun:
                        dataset.resources.remove(resource)

        return dataset
Example #36
0
    def serialize(cls, dataset):
        organization = None
        owner = None
        image_url = None
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT

        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        certified = organization and organization.certified

        document = {
            'title':
            dataset.title,
            'description':
            dataset.description,
            'license':
            getattr(dataset.license, 'id', None),
            'tags':
            dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest':
            dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            'format_suggest':
            [r.format.lower() for r in dataset.resources if r.format],
            'frequency':
            dataset.frequency,
            'organization':
            str(organization.id) if organization else None,
            'owner':
            str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created':
            dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified':
            dataset.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
            'metrics':
            dataset.metrics,
            'featured':
            dataset.featured,
            'from_certified':
            certified,
        }
        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            start = dataset.temporal_coverage.start.toordinal()
            end = dataset.temporal_coverage.end.toordinal()
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document.update({
                'temporal_coverage': {
                    'start': start,
                    'end': end
                },
                'temporal_weight': temporal_weight,
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values
                })
                parents |= set(zone.parents)
                coverage_level = min(coverage_level, admin_levels[zone.level])

            geozones.extend([{'id': p} for p in parents])

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
                'spatial_weight': spatial_weight,
            })

        document['dataset_suggest']['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            document['dataset_suggest']['input'].append(dataset.acronym)

        return document
Example #37
0
def inject_organization_needs(sender, identity):
    if current_user.is_authenticated():
        for org in Organization.objects(members__user=current_user.id):
            membership = get_by(org.members, 'user', current_user._get_current_object())
            identity.provides.add(OrganizationNeed(membership.role, org.id))
Example #38
0
    def serialize(cls, dataset):
        organization = None
        owner = None

        if dataset.organization:
            org = Organization.objects(id=dataset.organization.id).first()
            organization = {
                'id': str(org.id),
                'name': org.name,
                'public_service': 1 if org.public_service else 0,
                'followers': org.metrics.get('followers', 0)
            }
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()

        document = {
            'id': str(dataset.id),
            'title': dataset.title,
            'description': dataset.description,
            'acronym': dataset.acronym or None,
            'url': dataset.display_url,
            'tags': dataset.tags,
            'license': getattr(dataset.license, 'id', None),
            'badges': [badge.kind for badge in dataset.badges],
            'frequency': dataset.frequency,
            'created_at': to_iso_datetime(dataset.created_at),
            'views': dataset.metrics.get('views', 0),
            'followers': dataset.metrics.get('followers', 0),
            'reuses': dataset.metrics.get('reuses', 0),
            'featured': 1 if dataset.featured else 0,
            'resources_count': len(dataset.resources),
            'organization': organization,
            'owner': str(owner.id) if owner else None,
            'format':
            [r.format.lower() for r in dataset.resources if r.format],
            'schema':
            [r.schema.get('name') for r in dataset.resources if r.schema]
        }
        extras = {}
        for key, value in dataset.extras.items():
            extras[key] = to_iso_datetime(value) if isinstance(
                value, datetime.datetime) else value
        document.update({'extras': extras})

        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            start = to_iso_datetime(dataset.temporal_coverage.start)
            end = to_iso_datetime(dataset.temporal_coverage.end)
            document.update({
                'temporal_coverage_start': start,
                'temporal_coverage_end': end,
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values
                })
                parents |= set(zone.parents)
                coverage_level = min(coverage_level, admin_levels[zone.level])

            geozones.extend([{'id': p} for p in parents])
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
            })
        return document
Example #39
0
 def get_value(self):
     org = (Organization.objects(metrics__datasets__gt=0).visible()
            .order_by('-metrics.datasets').first())
     return org.metrics['datasets'] if org else 0